Blame - linux-x64/clang/include/llvm/MCA/HardwareUnits/LSUnit.h - hafnium/prebuilts

blob: b8a9f2755b0553fc77c998653345b4c87aa3a97e [file] [log] [blame]

Andrew Walbran	16937d0	2019-10-22 13:54:20 +0100	[diff] [blame^]	1	//===------------------------- LSUnit.h --------------------------- C++--===//
				2	//
				3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	// See https://llvm.org/LICENSE.txt for license information.
				5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				6	//
				7	//===----------------------------------------------------------------------===//
				8	/// \file
				9	///
				10	/// A Load/Store unit class that models load/store queues and that implements
				11	/// a simple weak memory consistency model.
				12	///
				13	//===----------------------------------------------------------------------===//
				14
				15	#ifndef LLVM_MCA_LSUNIT_H
				16	#define LLVM_MCA_LSUNIT_H
				17
				18	#include "llvm/ADT/SmallSet.h"
				19	#include "llvm/MC/MCSchedule.h"
				20	#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
				21
				22	namespace llvm {
				23	namespace mca {
				24
				25	class InstRef;
				26	class Scheduler;
				27
				28	/// A Load/Store Unit implementing a load and store queues.
				29	///
				30	/// This class implements a load queue and a store queue to emulate the
				31	/// out-of-order execution of memory operations.
				32	/// Each load (or store) consumes an entry in the load (or store) queue.
				33	///
				34	/// Rules are:
				35	/// 1) A younger load is allowed to pass an older load only if there are no
				36	/// stores nor barriers in between the two loads.
				37	/// 2) An younger store is not allowed to pass an older store.
				38	/// 3) A younger store is not allowed to pass an older load.
				39	/// 4) A younger load is allowed to pass an older store only if the load does
				40	/// not alias with the store.
				41	///
				42	/// This class optimistically assumes that loads don't alias store operations.
				43	/// Under this assumption, younger loads are always allowed to pass older
				44	/// stores (this would only affects rule 4).
				45	/// Essentially, this class doesn't perform any sort alias analysis to
				46	/// identify aliasing loads and stores.
				47	///
				48	/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
				49	/// set to `false` by the constructor of LSUnit.
				50	///
				51	/// Note that this class doesn't know about the existence of different memory
				52	/// types for memory operations (example: write-through, write-combining, etc.).
				53	/// Derived classes are responsible for implementing that extra knowledge, and
				54	/// provide different sets of rules for loads and stores by overriding method
				55	/// `isReady()`.
				56	/// To emulate a write-combining memory type, rule 2. must be relaxed in a
				57	/// derived class to enable the reordering of non-aliasing store operations.
				58	///
				59	/// No assumptions are made by this class on the size of the store buffer. This
				60	/// class doesn't know how to identify cases where store-to-load forwarding may
				61	/// occur.
				62	///
				63	/// LSUnit doesn't attempt to predict whether a load or store hits or misses
				64	/// the L1 cache. To be more specific, LSUnit doesn't know anything about
				65	/// cache hierarchy and memory types.
				66	/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
				67	/// scheduling model provides an "optimistic" load-to-use latency (which usually
				68	/// matches the load-to-use latency for when there is a hit in the L1D).
				69	/// Derived classes may expand this knowledge.
				70	///
				71	/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
				72	/// memory-barrier like instructions.
				73	/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
				74	/// `unmodeled side effects` behave like a "soft" load-barrier. That means, it
				75	/// serializes loads without forcing a flush of the load queue.
				76	/// Similarly, instructions that both `mayStore` and have `unmodeled side
				77	/// effects` are treated like store barriers. A full memory
				78	/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
				79	/// effects. This is obviously inaccurate, but this is the best that we can do
				80	/// at the moment.
				81	///
				82	/// Each load/store barrier consumes one entry in the load/store queue. A
				83	/// load/store barrier enforces ordering of loads/stores:
				84	/// - A younger load cannot pass a load barrier.
				85	/// - A younger store cannot pass a store barrier.
				86	///
				87	/// A younger load has to wait for the memory load barrier to execute.
				88	/// A load/store barrier is "executed" when it becomes the oldest entry in
				89	/// the load/store queue(s). That also means, all the older loads/stores have
				90	/// already been executed.
				91	class LSUnit : public HardwareUnit {
				92	// Load queue size.
				93	// LQ_Size == 0 means that there are infinite slots in the load queue.
				94	unsigned LQ_Size;
				95
				96	// Store queue size.
				97	// SQ_Size == 0 means that there are infinite slots in the store queue.
				98	unsigned SQ_Size;
				99
				100	// If true, loads will never alias with stores. This is the default.
				101	bool NoAlias;
				102
				103	// When a `MayLoad` instruction is dispatched to the schedulers for execution,
				104	// the LSUnit reserves an entry in the `LoadQueue` for it.
				105	//
				106	// LoadQueue keeps track of all the loads that are in-flight. A load
				107	// instruction is eventually removed from the LoadQueue when it reaches
				108	// completion stage. That means, a load leaves the queue whe it is 'executed',
				109	// and its value can be forwarded on the data path to outside units.
				110	//
				111	// This class doesn't know about the latency of a load instruction. So, it
				112	// conservatively/pessimistically assumes that the latency of a load opcode
				113	// matches the instruction latency.
				114	//
				115	// FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
				116	// and load/store conflicts, the latency of a load is determined by the depth
				117	// of the load pipeline. So, we could use field `LoadLatency` in the
				118	// MCSchedModel to model that latency.
				119	// Field `LoadLatency` often matches the so-called 'load-to-use' latency from
				120	// L1D, and it usually already accounts for any extra latency due to data
				121	// forwarding.
				122	// When doing throughput analysis, `LoadLatency` is likely to
				123	// be a better predictor of load latency than instruction latency. This is
				124	// particularly true when simulating code with temporal/spatial locality of
				125	// memory accesses.
				126	// Using `LoadLatency` (instead of the instruction latency) is also expected
				127	// to improve the load queue allocation for long latency instructions with
				128	// folded memory operands (See PR39829).
				129	//
				130	// FIXME: On some processors, load/store operations are split into multiple
				131	// uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
				132	// not 256-bit data types. So, a 256-bit load is effectively split into two
				133	// 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
				134	// simplicity, this class optimistically assumes that a load instruction only
				135	// consumes one entry in the LoadQueue. Similarly, store instructions only
				136	// consume a single entry in the StoreQueue.
				137	// In future, we should reassess the quality of this design, and consider
				138	// alternative approaches that let instructions specify the number of
				139	// load/store queue entries which they consume at dispatch stage (See
				140	// PR39830).
				141	SmallSet<unsigned, 16> LoadQueue;
				142	SmallSet<unsigned, 16> StoreQueue;
				143
				144	void assignLQSlot(unsigned Index);
				145	void assignSQSlot(unsigned Index);
				146	bool isReadyNoAlias(unsigned Index) const;
				147
				148	// An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
				149	// conservatively treated as a store barrier. It forces older store to be
				150	// executed before newer stores are issued.
				151	SmallSet<unsigned, 8> StoreBarriers;
				152
				153	// An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
				154	// conservatively treated as a load barrier. It forces older loads to execute
				155	// before newer loads are issued.
				156	SmallSet<unsigned, 8> LoadBarriers;
				157
				158	bool isSQEmpty() const { return StoreQueue.empty(); }
				159	bool isLQEmpty() const { return LoadQueue.empty(); }
				160	bool isSQFull() const { return SQ_Size != 0 && StoreQueue.size() == SQ_Size; }
				161	bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
				162
				163	public:
				164	LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0,
				165	bool AssumeNoAlias = false);
				166
				167	#ifndef NDEBUG
				168	void dump() const;
				169	#endif
				170
				171	enum Status { LSU_AVAILABLE = 0, LSU_LQUEUE_FULL, LSU_SQUEUE_FULL };
				172
				173	// Returns LSU_AVAILABLE if there are enough load/store queue entries to serve
				174	// IR. It also returns LSU_AVAILABLE if IR is not a memory operation.
				175	Status isAvailable(const InstRef &IR) const;
				176
				177	// Allocates load/store queue resources for IR.
				178	//
				179	// This method assumes that a previous call to `isAvailable(IR)` returned
				180	// LSU_AVAILABLE, and that IR is a memory operation.
				181	void dispatch(const InstRef &IR);
				182
				183	// By default, rules are:
				184	// 1. A store may not pass a previous store.
				185	// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
				186	// 3. A load may pass a previous load.
				187	// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
				188	// 5. A load has to wait until an older load barrier is fully executed.
				189	// 6. A store has to wait until an older store barrier is fully executed.
				190	virtual bool isReady(const InstRef &IR) const;
				191
				192	// Load and store instructions are tracked by their corresponding queues from
				193	// dispatch until the "instruction executed" event.
				194	// Only when a load instruction reaches the 'Executed' stage, its value
				195	// becomes available to the users. At that point, the load no longer needs to
				196	// be tracked by the load queue.
				197	// FIXME: For simplicity, we optimistically assume a similar behavior for
				198	// store instructions. In practice, store operations don't tend to leave the
				199	// store queue until they reach the 'Retired' stage (See PR39830).
				200	void onInstructionExecuted(const InstRef &IR);
				201	};
				202
				203	} // namespace mca
				204	} // namespace llvm
				205
				206	#endif // LLVM_MCA_LSUNIT_H