Switch to fully flushing the data cache.

PSCI says the caches should be cleaned and invalidated at the entry
point, so that is what we should do. Shared memory should also be
flushed to reduce the chance of coherency issues in the VMs.
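
A rough caller-side sketch of the pattern this change targets (not part
of the patch): arch_mm_flush_dcache is the function renamed below, while
prepare_image and its parameters are hypothetical stand-ins for the
callers in src/load.c and test/hftest/power_mgmt.c.

    #include <stddef.h>
    #include <string.h>

    extern void arch_mm_flush_dcache(void *base, size_t size);

    static void prepare_image(void *dst, const void *src, size_t size)
    {
            /* Populate the memory through a cacheable mapping. */
            memcpy(dst, src, size);

            /*
             * Clean and invalidate to the point of coherency so a
             * consumer that starts with its data cache disabled, e.g.
             * a CPU entering via PSCI, reads the data just written
             * rather than stale lines.
             */
            arch_mm_flush_dcache(dst, size);
    }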

Change-Id: I99c1f1fa7b44d290dc81e7d5496f4f90192c08c5
diff --git a/inc/hf/arch/mm.h b/inc/hf/arch/mm.h
index 7156e4d..d470d08 100644
--- a/inc/hf/arch/mm.h
+++ b/inc/hf/arch/mm.h
@@ -112,10 +112,11 @@
 void arch_mm_invalidate_stage2_range(ipaddr_t va_begin, ipaddr_t va_end);
 
 /**
- * Writes the given range of virtual memory back to the point of unification so
- * all cores and devices will see the updated values.
+ * Writes back the given range of virtual memory to the point of coherency so
+ * that all cores and devices will see the updated values. The corresponding
+ * cache lines are also invalidated.
  */
-void arch_mm_write_back_dcache(void *base, size_t size);
+void arch_mm_flush_dcache(void *base, size_t size);
 
 /**
  * Gets the maximum level allowed in the page table for stage-1.
diff --git a/src/api.c b/src/api.c
index 1fed670..f4ec664 100644
--- a/src/api.c
+++ b/src/api.c
@@ -1382,7 +1382,7 @@
 	}
 
 	memset_s(ptr, size, 0, size);
-	arch_mm_write_back_dcache(ptr, size);
+	arch_mm_flush_dcache(ptr, size);
 	mm_unmap(stage1_locked, begin, end, ppool);
 
 	ret = true;
diff --git a/src/arch/aarch64/mm.c b/src/arch/aarch64/mm.c
index 74102ee..595f9c0 100644
--- a/src/arch/aarch64/mm.c
+++ b/src/arch/aarch64/mm.c
@@ -370,19 +370,15 @@
 	       (UINT16_C(1) << ((read_msr(CTR_EL0) >> 16) & 0xf));
 }
 
-/**
- * Ensures that the range of data in the cache is written back so that it is
- * visible to all cores in the system.
- */
-void arch_mm_write_back_dcache(void *base, size_t size)
+void arch_mm_flush_dcache(void *base, size_t size)
 {
-	/* Clean each data cache line that corresponds to data in the range. */
+	/* Clean and invalidate each data cache line in the range. */
 	uint16_t line_size = arch_mm_dcache_line_size();
 	uintptr_t line_begin = (uintptr_t)base & ~(line_size - 1);
 	uintptr_t end = (uintptr_t)base + size;
 
 	while (line_begin < end) {
-		__asm__ volatile("dc cvac, %0" : : "r"(line_begin));
+		__asm__ volatile("dc civac, %0" : : "r"(line_begin));
 		line_begin += line_size;
 	}
 	dsb(sy);
diff --git a/src/arch/fake/mm.c b/src/arch/fake/mm.c
index 4cda7cf..c788518 100644
--- a/src/arch/fake/mm.c
+++ b/src/arch/fake/mm.c
@@ -121,7 +121,7 @@
 	/* There's no modelling of the stage-2 TLB. */
 }
 
-void arch_mm_write_back_dcache(void *base, size_t size)
+void arch_mm_flush_dcache(void *base, size_t size)
 {
 	/* There's no modelling of the cache. */
 }
diff --git a/src/load.c b/src/load.c
index cb4fa90..db19e05 100644
--- a/src/load.c
+++ b/src/load.c
@@ -53,7 +53,7 @@
 	}
 
 	memcpy_s(ptr, size, from, size);
-	arch_mm_write_back_dcache(ptr, size);
+	arch_mm_flush_dcache(ptr, size);
 
 	mm_unmap(stage1_locked, to, to_end, ppool);
 
diff --git a/test/hftest/power_mgmt.c b/test/hftest/power_mgmt.c
index 576c6bd..2daba31 100644
--- a/test/hftest/power_mgmt.c
+++ b/test/hftest/power_mgmt.c
@@ -67,10 +67,12 @@
 	s_arch.arg = (uintptr_t)&s;
 
 	/*
-	 * Write back the `cpu_start_state` struct because the new CPU will be
+	 * Flush the `cpu_start_state` struct because the new CPU will be
 	 * started without caching enabled and will need the data early on.
+	 * A write back is all that is strictly needed here, so a full flush
+	 * certainly covers it.
 	 */
-	arch_mm_write_back_dcache(&s_arch, sizeof(s_arch));
+	arch_mm_flush_dcache(&s_arch, sizeof(s_arch));
 
 	if ((s_arch.initial_sp % STACK_ALIGN) != 0) {
 		HFTEST_FAIL(true,