Ensure that register state is saved before it's used.

Without this, there is a race in which one physical CPU is still saving
register contents into a vCPU's buffer while another physical CPU is
already trying to restore that state.

Also add a test case that fails without this fix and succeeds with it.
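
For reference, the shape of the handshake added by this change: each
vCPU gains a 'regs_available' flag, the context-switch save path sets
it once the registers have actually been written out
(api_regs_state_saved), and api_vcpu_prepare_run spins until the flag
is set before handing the vCPU's registers to a physical CPU. Below is
a minimal standalone sketch of that pattern in plain C, with pthreads
standing in for physical CPUs and a mutex standing in for the vCPU
spinlock; it is illustrative only, not Hafnium code.

/*
 * Sketch of the save/restore handshake: a "saver" thread plays the
 * pCPU that is still writing the vCPU's registers out, a "runner"
 * thread plays the pCPU that wants to restore them. Build with
 * -pthread.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_vcpu {
	pthread_mutex_t lock;
	bool regs_available;
	unsigned long regs[4];
};

static struct fake_vcpu vcpu = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.regs_available = false, /* Save still in progress. */
};

/* Models the tail of the context-switch save path. */
static void *saver(void *arg)
{
	(void)arg;
	for (int i = 0; i < 4; i++) {
		vcpu.regs[i] = 42 + i; /* Finish writing the registers out. */
	}

	pthread_mutex_lock(&vcpu.lock);
	vcpu.regs_available = true; /* As api_regs_state_saved does. */
	pthread_mutex_unlock(&vcpu.lock);
	return NULL;
}

/* Models api_vcpu_prepare_run: don't touch regs until they are saved. */
static void *runner(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&vcpu.lock);
	while (!vcpu.regs_available) {
		/* Drop and retake the lock, mirroring the sl_unlock/sl_lock
		 * loop in the patch, rather than sleeping. */
		pthread_mutex_unlock(&vcpu.lock);
		pthread_mutex_lock(&vcpu.lock);
	}
	vcpu.regs_available = false; /* Claim the registers for this pCPU. */
	pthread_mutex_unlock(&vcpu.lock);

	printf("restored r0=%lu\n", vcpu.regs[0]); /* Safe: save completed. */
	return NULL;
}

int main(void)
{
	pthread_t s;
	pthread_t r;

	pthread_create(&r, NULL, runner, NULL);
	pthread_create(&s, NULL, saver, NULL);
	pthread_join(r, NULL);
	pthread_join(s, NULL);
	return 0;
}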

Change-Id: I0badfd61f12385a3d8dc8d09eb298cfe56f2a415
diff --git a/inc/hf/cpu.h b/inc/hf/cpu.h
index 76d71d6..b477dec 100644
--- a/inc/hf/cpu.h
+++ b/inc/hf/cpu.h
@@ -54,6 +54,11 @@
 	uint32_t interrupt_pending[HF_NUM_INTIDS / INTERRUPT_REGISTER_BITS];
 };
 
+struct retval_state {
+	uintptr_t value;
+	bool force;
+};
+
 struct vcpu {
 	struct spinlock lock;
 	enum vcpu_state state;
@@ -62,6 +67,20 @@
 	struct vcpu *mailbox_next;
 	struct arch_regs regs;
 	struct interrupts interrupts;
+
+	/*
+	 * The following field is used to force a return value to be set the
+	 * next time a vCPU belonging to a secondary VM runs. For primary VMs,
+	 * 'regs' can be set directly.
+	 */
+	struct retval_state retval;
+
+	/*
+	 * Determines whether the 'regs' field is available for use. It is set
+	 * to false when a vCPU is about to run on a physical CPU, and back to
+	 * true once its register state has been saved.
+	 */
+	bool regs_available;
 };
 
 /* TODO: Update alignment such that cpus are in different cache lines. */
diff --git a/kokoro/ubuntu/test.sh b/kokoro/ubuntu/test.sh
index be49777..14f67e4 100755
--- a/kokoro/ubuntu/test.sh
+++ b/kokoro/ubuntu/test.sh
@@ -44,3 +44,4 @@
 $HFTEST hafnium --initrd gicv3_test
 $HFTEST hafnium --initrd primary_only_test
 $HFTEST hafnium --initrd primary_with_secondaries_test
+$HFTEST hafnium --initrd primary_with_state_secondary_test
diff --git a/src/api.c b/src/api.c
index 4ab7416..6be8702 100644
--- a/src/api.c
+++ b/src/api.c
@@ -129,6 +129,66 @@
 }
 
 /**
+ * Called by the architecture-specific context-switching code to indicate that
+ * the register state of the given vcpu has been saved and can therefore be
+ * used by other physical CPUs.
+ */
+void api_regs_state_saved(struct vcpu *vcpu)
+{
+	sl_lock(&vcpu->lock);
+	vcpu->regs_available = true;
+	sl_unlock(&vcpu->lock);
+}
+
+/**
+ * Prepares the vcpu to run by updating its state and fetching any return
+ * value that needs to be forced onto it when it next runs.
+ */
+static bool api_vcpu_prepare_run(const struct vcpu *current, struct vcpu *vcpu,
+				 struct retval_state *vcpu_retval)
+{
+	bool ret;
+
+	sl_lock(&vcpu->lock);
+	if (vcpu->state != vcpu_state_ready) {
+		ret = false;
+		goto out;
+	}
+
+	vcpu->cpu = current->cpu;
+	vcpu->state = vcpu_state_running;
+
+	/* Fetch return value to inject into vCPU if there is one. */
+	*vcpu_retval = vcpu->retval;
+	if (vcpu_retval->force) {
+		vcpu->retval.force = false;
+	}
+
+	/*
+	 * Wait until the registers become available. Care must be taken when
+	 * looping on this: to avoid deadlock, it must not be done while
+	 * holding other locks.
+	 */
+	while (!vcpu->regs_available) {
+		sl_unlock(&vcpu->lock);
+		sl_lock(&vcpu->lock);
+	}
+
+	/*
+	 * Mark the registers as unavailable now that we're about to reflect
+	 * them onto the real registers. This will also prevent another physical
+	 * CPU from trying to read these registers.
+	 */
+	vcpu->regs_available = false;
+
+	ret = true;
+
+out:
+	sl_unlock(&vcpu->lock);
+	return ret;
+}
+
+/**
  * Runs the given vcpu of the given vm.
  */
 struct hf_vcpu_run_return api_vcpu_run(uint32_t vm_id, uint32_t vcpu_idx,
@@ -137,6 +197,7 @@
 {
 	struct vm *vm;
 	struct vcpu *vcpu;
+	struct retval_state vcpu_retval;
 	struct hf_vcpu_run_return ret = {
 		.code = HF_VCPU_RUN_WAIT_FOR_INTERRUPT,
 	};
@@ -162,18 +223,20 @@
 		goto out;
 	}
 
+	/* Update state if allowed. */
 	vcpu = &vm->vcpus[vcpu_idx];
-
-	sl_lock(&vcpu->lock);
-	if (vcpu->state != vcpu_state_ready) {
+	if (!api_vcpu_prepare_run(current, vcpu, &vcpu_retval)) {
 		ret.code = HF_VCPU_RUN_WAIT_FOR_INTERRUPT;
-	} else {
-		vcpu->cpu = current->cpu;
-		vcpu->state = vcpu_state_running;
-		*next = vcpu;
-		ret.code = HF_VCPU_RUN_YIELD;
+		goto out;
 	}
-	sl_unlock(&vcpu->lock);
+
+	*next = vcpu;
+	ret.code = HF_VCPU_RUN_YIELD;
+
+	/* Update return value if one was injected. */
+	if (vcpu_retval.force) {
+		arch_regs_set_retval(&vcpu->regs, vcpu_retval.value);
+	}
 
 out:
 	return ret;
@@ -353,12 +416,12 @@
 		to_vcpu->state = vcpu_state_ready;
 
 		/* Return from HF_MAILBOX_RECEIVE. */
-		arch_regs_set_retval(&to_vcpu->regs,
-				     hf_mailbox_receive_return_encode((
-					     struct hf_mailbox_receive_return){
-					     .vm_id = to->mailbox.recv_from_id,
-					     .size = size,
-				     }));
+		to_vcpu->retval.force = true;
+		to_vcpu->retval.value = hf_mailbox_receive_return_encode(
+			(struct hf_mailbox_receive_return){
+				.vm_id = to->mailbox.recv_from_id,
+				.size = size,
+			});
 
 		sl_unlock(&to_vcpu->lock);
 
diff --git a/src/arch/aarch64/exceptions.S b/src/arch/aarch64/exceptions.S
index db0cec0..0f06bdd 100644
--- a/src/arch/aarch64/exceptions.S
+++ b/src/arch/aarch64/exceptions.S
@@ -308,8 +308,15 @@
 	mrs x26, vttbr_el2
 	str x26, [x1, #VCPU_LAZY + 16 * 13]
 
-	/* Intentional fallthrough. */
+	/* Save new vcpu pointer in non-volatile register. */
+	mov x19, x0
 
+	/* Inform the arch-independent sections that regs have been saved. */
+	mov x0, x1
+	bl api_regs_state_saved
+	mov x0, x19
+
+	/* Intentional fallthrough. */
 .globl vcpu_restore_all_and_run
 vcpu_restore_all_and_run:
 	/* Update pointer to current vcpu. */
diff --git a/src/arch/aarch64/hftest/BUILD.gn b/src/arch/aarch64/hftest/BUILD.gn
index dca75b0..b8fdc48 100644
--- a/src/arch/aarch64/hftest/BUILD.gn
+++ b/src/arch/aarch64/hftest/BUILD.gn
@@ -30,6 +30,7 @@
 
 # Shutdown the system or exit emulation, start/stop CPUs.
 source_set("power_mgmt") {
+  testonly = true
   sources = [
     "cpu_entry.S",
     "power_mgmt.c",
@@ -40,7 +41,7 @@
   ]
 }
 
-# Exception handlers for interrupts
+# Exception handlers for interrupts and gicv3 el1 driver.
 source_set("interrupts_gicv3") {
   testonly = true
   sources = [
@@ -48,3 +49,11 @@
     "interrupts_gicv3.c",
   ]
 }
+
+# Get/set CPU state.
+source_set("state") {
+  testonly = true
+  sources = [
+    "state.c",
+  ]
+}
diff --git a/src/arch/aarch64/hftest/state.c b/src/arch/aarch64/hftest/state.c
new file mode 100644
index 0000000..219869a
--- /dev/null
+++ b/src/arch/aarch64/hftest/state.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2018 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hf/arch/vm/state.h"
+
+#include "../msr.h"
+
+void per_cpu_ptr_set(uintptr_t v)
+{
+	write_msr(tpidr_el1, v);
+}
+
+uintptr_t per_cpu_ptr_get(void)
+{
+	return read_msr(tpidr_el1);
+}
diff --git a/src/arch/aarch64/inc/hf/arch/cpu.h b/src/arch/aarch64/inc/hf/arch/cpu.h
index 175551f..11f07d7 100644
--- a/src/arch/aarch64/inc/hf/arch/cpu.h
+++ b/src/arch/aarch64/inc/hf/arch/cpu.h
@@ -73,8 +73,7 @@
 }
 
 static inline void arch_regs_init(struct arch_regs *r, bool is_primary,
-				  uint64_t vmid, paddr_t table, ipaddr_t pc,
-				  uintreg_t arg)
+				  uint64_t vmid, paddr_t table, uint32_t index)
 {
 	uintreg_t hcr;
 	uintreg_t cptr;
@@ -108,18 +107,32 @@
 	r->lazy.cptr_el2 = cptr;
 	r->lazy.cnthctl_el2 = cnthctl;
 	r->lazy.vttbr_el2 = pa_addr(table) | (vmid << 48);
+	r->lazy.vmpidr_el2 = index;
 	/* TODO: Use constant here. */
 	r->spsr = 5 |	 /* M bits, set to EL1h. */
 		  (0xf << 6); /* DAIF bits set; disable interrupts. */
+}
+
+/**
+ * Updates the given registers so that when a vcpu runs, it starts off at the
+ * given address (pc) with the given argument.
+ *
+ * This function must only be called on an arch_regs that is known not to be
+ * in use by any other physical CPU.
+ */
+static inline void arch_regs_set_pc_arg(struct arch_regs *r, ipaddr_t pc,
+					uintreg_t arg)
+{
 	r->pc = ipa_addr(pc);
 	r->r[0] = arg;
 }
 
-static inline void arch_regs_set_vcpu_index(struct arch_regs *r, uint32_t index)
-{
-	r->lazy.vmpidr_el2 = index;
-}
-
+/**
+ * Updates the register holding the return value of a function.
+ *
+ * This function must only be called on an arch_regs that is known not to be
+ * in use by any other physical CPU.
+ */
 static inline void arch_regs_set_retval(struct arch_regs *r, uintreg_t v)
 {
 	r->r[0] = v;
diff --git a/src/arch/aarch64/inc/hf/arch/vm/state.h b/src/arch/aarch64/inc/hf/arch/vm/state.h
new file mode 100644
index 0000000..9eb4a51
--- /dev/null
+++ b/src/arch/aarch64/inc/hf/arch/vm/state.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2018 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+void per_cpu_ptr_set(uintptr_t v);
+uintptr_t per_cpu_ptr_get(void);
diff --git a/src/arch/fake/inc/hf/arch/cpu.h b/src/arch/fake/inc/hf/arch/cpu.h
index 8cac590..cd8a47c 100644
--- a/src/arch/fake/inc/hf/arch/cpu.h
+++ b/src/arch/fake/inc/hf/arch/cpu.h
@@ -40,20 +40,20 @@
 }
 
 static inline void arch_regs_init(struct arch_regs *r, bool is_primary,
-				  uint64_t vmid, paddr_t table, ipaddr_t pc,
-				  uintreg_t arg)
+				  uint64_t vmid, paddr_t table, uint64_t index)
 {
 	/* TODO */
 	(void)is_primary;
 	(void)vmid;
 	(void)table;
-	(void)pc;
-	r->r[0] = arg;
+	r->vcpu_index = index;
 }
 
-static inline void arch_regs_set_vcpu_index(struct arch_regs *r, uint16_t index)
+static inline void arch_regs_set_pc_arg(struct arch_regs *r, ipaddr_t pc,
+					uintreg_t arg)
 {
-	r->vcpu_index = index;
+	(void)pc;
+	r->r[0] = arg;
 }
 
 static inline void arch_regs_set_retval(struct arch_regs *r, uintreg_t v)
diff --git a/src/cpu.c b/src/cpu.c
index 0f2bbba..1ae6cd1 100644
--- a/src/cpu.c
+++ b/src/cpu.c
@@ -96,8 +96,7 @@
 	if (!prev) {
 		struct vm *vm = vm_get(HF_PRIMARY_VM_ID);
 		struct vcpu *vcpu = &vm->vcpus[cpu_index(c)];
-		arch_regs_init(&vcpu->regs, true, vm->id, vm->ptable.root,
-			       entry, arg);
+		arch_regs_set_pc_arg(&vcpu->regs, entry, arg);
 		vcpu_on(vcpu);
 	}
 
@@ -134,9 +133,11 @@
 {
 	memset(vcpu, 0, sizeof(*vcpu));
 	sl_init(&vcpu->lock);
+	vcpu->regs_available = true;
 	vcpu->vm = vm;
 	vcpu->state = vcpu_state_off;
-	arch_regs_set_vcpu_index(&vcpu->regs, vcpu - vm->vcpus);
+	arch_regs_init(&vcpu->regs, vm->id == HF_PRIMARY_VM_ID, vm->id,
+		       vm->ptable.root, vcpu_index(vcpu));
 }
 
 void vcpu_on(struct vcpu *vcpu)
diff --git a/src/vm.c b/src/vm.c
index f1fcbb6..4a1d9d7 100644
--- a/src/vm.c
+++ b/src/vm.c
@@ -42,6 +42,10 @@
 	vm->vcpu_count = vcpu_count;
 	vm->mailbox.state = mailbox_state_empty;
 
+	if (!mm_ptable_init(&vm->ptable, 0, ppool)) {
+		return false;
+	}
+
 	/* Do basic initialization of vcpus. */
 	for (i = 0; i < vcpu_count; i++) {
 		vcpu_init(&vm->vcpus[i], vm);
@@ -50,7 +54,7 @@
 	++vm_count;
 	*new_vm = vm;
 
-	return mm_ptable_init(&vm->ptable, 0, ppool);
+	return true;
 }
 
 uint32_t vm_get_count(void)
@@ -73,8 +77,7 @@
 {
 	struct vcpu *vcpu = &vm->vcpus[index];
 	if (index < vm->vcpu_count) {
-		arch_regs_init(&vcpu->regs, vm->id == HF_PRIMARY_VM_ID, vm->id,
-			       vm->ptable.root, entry, arg);
+		arch_regs_set_pc_arg(&vcpu->regs, entry, arg);
 		vcpu_on(vcpu);
 	}
 }
diff --git a/test/vmapi/BUILD.gn b/test/vmapi/BUILD.gn
index da28503..fab748c 100644
--- a/test/vmapi/BUILD.gn
+++ b/test/vmapi/BUILD.gn
@@ -25,6 +25,7 @@
   deps = [
     ":primary_only_test",
     ":primary_with_secondaries_test",
+    ":primary_with_state_secondary_test",
   ]
 }
 
@@ -114,3 +115,28 @@
     ],
   ]
 }
+
+# Tests using the state-checking secondary VM.
+vm_kernel("primary_with_state_secondary_test_vm") {
+  testonly = true
+
+  sources = [
+    "primary_for_state_test.c",
+  ]
+
+  deps = [
+    "//test/hftest:hftest_primary_vm",
+  ]
+}
+
+initrd("primary_with_state_secondary_test") {
+  testonly = true
+
+  primary_vm = ":primary_with_state_secondary_test_vm"
+  secondary_vms = [ [
+        "1048576",
+        "1",
+        "check_state",
+        "//test/vmapi/secondaries:check_state",
+      ] ]
+}
diff --git a/test/vmapi/primary_for_state_test.c b/test/vmapi/primary_for_state_test.c
new file mode 100644
index 0000000..9a3b732
--- /dev/null
+++ b/test/vmapi/primary_for_state_test.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2018 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+#include <stdalign.h>
+#include <stdint.h>
+
+#include "hf/arch/vm/power_mgmt.h"
+
+#include "hf/mm.h"
+#include "hf/std.h"
+
+#include "vmapi/hf/call.h"
+
+#include "hftest.h"
+
+static alignas(PAGE_SIZE) uint8_t send_page[PAGE_SIZE];
+static alignas(PAGE_SIZE) uint8_t recv_page[PAGE_SIZE];
+static_assert(sizeof(send_page) == PAGE_SIZE, "Send page is not a page.");
+static_assert(sizeof(recv_page) == PAGE_SIZE, "Recv page is not a page.");
+
+static hf_ipaddr_t send_page_addr = (hf_ipaddr_t)send_page;
+static hf_ipaddr_t recv_page_addr = (hf_ipaddr_t)recv_page;
+
+#define STATE_VM_ID 1
+
+/**
+ * Repeatedly tries to run the vCPU of the secondary VM. Returns when a
+ * message of non-zero length is received.
+ */
+bool run_loop(void)
+{
+	struct hf_vcpu_run_return run_res;
+	bool ok = false;
+
+	for (;;) {
+		/* Run until it manages to schedule the vCPU on this CPU. */
+		do {
+			run_res = hf_vcpu_run(STATE_VM_ID, 0);
+		} while (run_res.code == HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+
+		/* Break out if we received a message with non-zero length. */
+		if (run_res.code == HF_VCPU_RUN_MESSAGE &&
+		    run_res.message.size != 0) {
+			break;
+		}
+
+		/* Clear mailbox so that next message can be received. */
+		hf_mailbox_clear();
+	}
+
+	/* Copy the received boolean into the return value. */
+	if (run_res.message.size == sizeof(ok)) {
+		memcpy(&ok, recv_page, sizeof(ok));
+	}
+
+	hf_mailbox_clear();
+
+	return ok;
+}
+
+/**
+ * Entry point of the additional primary VM vCPU. It just calls the run loop
+ * so that two CPUs compete for the chance to run the secondary VM.
+ */
+static void vm_cpu_entry(uintptr_t arg)
+{
+	(void)arg;
+	run_loop();
+}
+
+/**
+ * This test tries to run the same secondary vCPU from two different physical
+ * CPUs concurrently. The vCPU checks that its state is preserved while it
+ * bounces between the physical CPUs.
+ */
+TEST(vcpu_state, concurrent_save_restore)
+{
+	alignas(4096) static char stack[4096];
+
+	EXPECT_EQ(hf_vm_configure(send_page_addr, recv_page_addr), 0);
+
+	/* Start second vCPU. */
+	EXPECT_EQ(cpu_start(1, stack, sizeof(stack), vm_cpu_entry, 0), true);
+
+	/* Run on a loop until the secondary VM is done. */
+	EXPECT_EQ(run_loop(), true);
+}
diff --git a/test/vmapi/secondaries/BUILD.gn b/test/vmapi/secondaries/BUILD.gn
index 52dfd57..43fb4cc 100644
--- a/test/vmapi/secondaries/BUILD.gn
+++ b/test/vmapi/secondaries/BUILD.gn
@@ -66,3 +66,16 @@
     "//test/hftest:hftest_secondary_vm",
   ]
 }
+
+vm_kernel("check_state") {
+  testonly = true
+
+  sources = [
+    "check_state.c",
+  ]
+
+  deps = [
+    "//src/arch/aarch64/hftest:state",
+    "//test/hftest:hftest_secondary_vm",
+  ]
+}
diff --git a/test/vmapi/secondaries/check_state.c b/test/vmapi/secondaries/check_state.c
new file mode 100644
index 0000000..e05c0a7
--- /dev/null
+++ b/test/vmapi/secondaries/check_state.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2018 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdalign.h>
+#include <stdint.h>
+
+#include "hf/arch/vm/state.h"
+
+#include "hf/mm.h"
+#include "hf/std.h"
+
+#include "vmapi/hf/call.h"
+
+alignas(4096) uint8_t kstack[4096];
+
+static alignas(PAGE_SIZE) uint8_t send_page[PAGE_SIZE];
+static alignas(PAGE_SIZE) uint8_t recv_page[PAGE_SIZE];
+
+static hf_ipaddr_t send_page_addr = (hf_ipaddr_t)send_page;
+static hf_ipaddr_t recv_page_addr = (hf_ipaddr_t)recv_page;
+
+void send_with_retry(uint32_t vm_id, size_t size)
+{
+	int64_t res;
+
+	do {
+		res = hf_mailbox_send(vm_id, size);
+	} while (res == -1);
+}
+
+/**
+ * This VM repeatedly takes the following steps: sets the per-cpu pointer to
+ * some value, makes a hypervisor call, and checks that the value is still
+ * what it was set to.
+ *
+ * This loop helps detect bugs where the hypervisor inadvertently destroys
+ * state.
+ *
+ * At the end of its iterations, the VM reports the result to the primary VM,
+ * which then passes or fails the test accordingly.
+ */
+void kmain(void)
+{
+	size_t i;
+	bool ok = true;
+	static volatile uintptr_t expected;
+	static volatile uintptr_t actual;
+
+	hf_vm_configure(send_page_addr, recv_page_addr);
+	for (i = 0; i < 100000; i++) {
+		/*
+		 * We store the expected/actual values in volatile static
+		 * variables to avoid relying on registers that may have been
+		 * modified by the hypervisor.
+		 */
+		expected = i;
+		per_cpu_ptr_set(expected);
+		send_with_retry(HF_PRIMARY_VM_ID, 0);
+		actual = per_cpu_ptr_get();
+		ok &= expected == actual;
+	}
+
+	/* Send two replies, one for each physical CPU. */
+	memcpy(send_page, &ok, sizeof(ok));
+	send_with_retry(HF_PRIMARY_VM_ID, sizeof(ok));
+	send_with_retry(HF_PRIMARY_VM_ID, sizeof(ok));
+}