Revert^2: Improve spinlock implementation for ARMv8.0

The current implementation of spinlocks using C11 atomics compiles to an
LDAXR/STXR pair which we observe does not always make forward progress
on the Cortex-A72 (the concurrent_save_restore test livelocks).
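
For reference, the C11 acquire loop lowers to roughly the following on
ARMv8.0 (schematic only; exact registers and access width depend on the
compiler), with no pause between failed attempts:

            mov    w3, #1
        1:  ldaxr  w1, [x0]        // x0 = &lock; load-acquire exclusive
            stxr   w2, w3, [x0]    // try to store 1
            cbnz   w2, 1b          // exclusive store failed, retry
            cbnz   w1, 1b          // lock was already held, retry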

This patch inserts a WFE instruction when the value returned by LDAXR
shows that the lock is currently taken, pausing the thread until an
event is triggered and easing contention on the lock. This is observed
to help forward progress on the A72 and should also reduce the power
consumption of our spinlocks.

Note that there is still no guarantee of forward progress: threads can
be shown to livelock on the A72 if the loops are extremely tight.
This should be revisited in the future, possibly by leveraging ARMv8.1
LSE atomics.
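
As a hypothetical sketch only (not part of this change, names are
illustrative), the acquire path on an ARMv8.1 build could use a single
atomic swap, which C11 atomics already express and which compilers
targeting armv8.1-a with LSE lower to SWPA:

    #include <stdatomic.h>
    #include <stdint.h>

    struct spinlock_lse {
            _Atomic uint32_t v;
    };

    static inline void sl_lock_lse(struct spinlock_lse *l)
    {
            /* Lowers to SWPA (swap with acquire) when built with LSE. */
            while (atomic_exchange_explicit(&l->v, 1,
                                            memory_order_acquire)) {
                    /* Still spins, but without the exclusive-store retry. */
            }
    }

This would remove the LDAXR/STXR retry window, though whether it is
enough for fairness on the A72 would still need to be measured.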

This reverts commit c4dfdb789421c7658ce59e24eb60cda38335c2f2.

Test: vcpu_state.concurrent_save_restore
Bug: 141087046
Change-Id: I0435632e4994e74457102ae7133bf77b47c0283f
diff --git a/inc/hf/spinlock.h b/inc/hf/spinlock.h
index dbd7890..2689a5c 100644
--- a/inc/hf/spinlock.h
+++ b/inc/hf/spinlock.h
@@ -16,27 +16,18 @@
 
 #pragma once
 
-#include <stdatomic.h>
-
-struct spinlock {
-	atomic_flag v;
-};
-
-#define SPINLOCK_INIT                 \
-	{                             \
-		.v = ATOMIC_FLAG_INIT \
-	}
+/*
+ * Includes the arch-specific definition of 'struct spinlock' and
+ * implementations of:
+ *  - SPINLOCK_INIT
+ *  - sl_lock()
+ *  - sl_unlock()
+ */
+#include "hf/arch/spinlock.h"
 
 static inline void sl_init(struct spinlock *l)
 {
-	*l = (struct spinlock)SPINLOCK_INIT;
-}
-
-static inline void sl_lock(struct spinlock *l)
-{
-	while (atomic_flag_test_and_set_explicit(&l->v, memory_order_acquire)) {
-		/* do nothing */
-	}
+	*l = SPINLOCK_INIT;
 }
 
 /**
@@ -53,8 +44,3 @@
 		sl_lock(a);
 	}
 }
-
-static inline void sl_unlock(struct spinlock *l)
-{
-	atomic_flag_clear_explicit(&l->v, memory_order_release);
-}
diff --git a/src/arch/aarch64/inc/hf/arch/spinlock.h b/src/arch/aarch64/inc/hf/arch/spinlock.h
new file mode 100644
index 0000000..96b01c8
--- /dev/null
+++ b/src/arch/aarch64/inc/hf/arch/spinlock.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The Hafnium Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/**
+ * Spinlock implementation using an ARMv8.0 LDAXR/STXR pair and a WFE pause.
+ *
+ * The implementation using C11 atomics also generates an LDAXR/STXR pair but
+ * no WFE. Without it we observe that the Cortex-A72 can easily livelock and
+ * fail to make forward progress.
+ *
+ * TODO(b/141087046): Forward progress is still not guaranteed, as even with
+ * WFE we see that the A72 can livelock in extremely tight loops. We should
+ * investigate the guarantees provided by the ARMv8.1 LSE atomic instructions.
+ */
+
+#include <stdint.h>
+
+#include "hf/arch/types.h"
+
+struct spinlock {
+	volatile uint32_t v;
+};
+
+#define SPINLOCK_INIT ((struct spinlock){.v = 0})
+
+static inline void sl_lock(struct spinlock *l)
+{
+	register uintreg_t tmp1;
+	register uintreg_t tmp2;
+
+	/*
+	 * Acquire the lock with a LDAXR/STXR pair (acquire semantics on the
+	 * load instruction). Pause using WFE if the lock is currently taken.
+	 * This is NOT guaranteed to make progress.
+	 */
+	__asm__ volatile(
+		"	mov	%w2, #1\n"
+		"	sevl\n" /* set event bit */
+		"1:	wfe\n"  /* wait for event, clear event bit */
+		"2:	ldaxr	%w1, [%0]\n"      /* load lock value */
+		"	cbnz	%w1, 1b\n"	/* if lock taken, goto WFE */
+		"	stxr	%w1, %w2, [%0]\n" /* try to take lock */
+		"	cbnz	%w1, 2b\n"	/* loop if unsuccessful */
+		: "+r"(l), "=&r"(tmp1), "=&r"(tmp2)
+		:
+		: "cc");
+}
+
+static inline void sl_unlock(struct spinlock *l)
+{
+	/*
+	 * Store zero to lock's value with release semantics. This triggers an
+	 * event which wakes up other threads waiting on a lock (no SEV needed).
+	 */
+	__asm__ volatile("stlr wzr, [%0]" : "+r"(l)::"cc");
+}
diff --git a/src/arch/fake/inc/hf/arch/spinlock.h b/src/arch/fake/inc/hf/arch/spinlock.h
new file mode 100644
index 0000000..db3b45c
--- /dev/null
+++ b/src/arch/fake/inc/hf/arch/spinlock.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2018 The Hafnium Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/**
+ * Generic implementation of a spinlock using C11 atomics.
+ * Does not work very well under contention.
+ */
+
+#include <stdatomic.h>
+
+struct spinlock {
+	atomic_flag v;
+};
+
+#define SPINLOCK_INIT ((struct spinlock){.v = ATOMIC_FLAG_INIT})
+
+static inline void sl_lock(struct spinlock *l)
+{
+	while (atomic_flag_test_and_set_explicit(&l->v, memory_order_acquire)) {
+		/* do nothing */
+	}
+}
+
+static inline void sl_unlock(struct spinlock *l)
+{
+	atomic_flag_clear_explicit(&l->v, memory_order_release);
+}
diff --git a/test/vmapi/primary_with_secondaries/run_race.c b/test/vmapi/primary_with_secondaries/run_race.c
index 4c10e86..0ef3387 100644
--- a/test/vmapi/primary_with_secondaries/run_race.c
+++ b/test/vmapi/primary_with_secondaries/run_race.c
@@ -77,6 +77,9 @@
  * This test tries to run the same secondary vCPU from two different physical
  * CPUs concurrently. The vCPU checks that the state is ok while it bounces
  * between the physical CPUs.
+ *
+ * The test is marked long-running because our implementation of spinlocks
+ * does not perform well under QEMU.
  */
 TEST_LONG_RUNNING(vcpu_state, concurrent_save_restore)
 {