UoB-HPC · FinnWilkinson · May 24, 2024 · May 28, 2024 · May 28, 2024 · May 28, 2024
diff --git a/configs/a64fx_SME.yaml b/configs/a64fx_SME.yaml
@@ -77,15 +77,15 @@ Ports:
     - INT_DIV_OR_SQRT
   5:
     Portname: EAGA
-    Instruction-Support: 
+    Instruction-Group-Support: 
     - LOAD
     - STORE_ADDRESS
     - INT_SIMPLE_ARTH_NOSHIFT
     - INT_SIMPLE_LOGICAL_NOSHIFT
     - INT_SIMPLE_CMP
   6:
     Portname: EAGB
-    Instruction-Support:
+    Instruction-Group-Support:
     - LOAD
     - STORE_ADDRESS
     - INT_SIMPLE_ARTH_NOSHIFT
@@ -95,10 +95,24 @@ Ports:
     Portname: BR
     Instruction-Group-Support:
     - BRANCH
+# Define example SME / SVE Streaming Mode units
   8:
     Portname: SME
     Instruction-Group-Support:
     - SME
+  9:
+    Portname: PR_S
+    Instruction-Group-Support:
+    - STREAMING_PREDICATE
+  10:
+    Portname: FLA_S
+    Instruction-Group-Support:
+    - STREAMING_SVE
+  11:
+    Portname: FLB_S
+    Instruction-Group-Support:
+    - STREAMING_SVE_SIMPLE
+    - STREAMING_SVE_MUL
 Reservation-Stations:
   0:
     Size: 20
@@ -133,6 +147,13 @@ Reservation-Stations:
     Dispatch-Rate: 1
     Ports:
     - SME
+  6:
+    Size: 40
+    Dispatch-Rate: 3
+    Ports:
+    - FLA_S
+    - FLB_S
+    - PR_S
 Execution-Units:
   0:
     Pipelined: True
@@ -188,6 +209,24 @@ Execution-Units:
     - INT_DIV_OR_SQRT
     - FP_DIV_OR_SQRT
     - SVE_DIV_OR_SQRT
+  9:
+    Pipelined: True
+    Blocking-Groups:
+    - INT_DIV_OR_SQRT
+    - FP_DIV_OR_SQRT
+    - SVE_DIV_OR_SQRT
+  10:
+    Pipelined: True
+    Blocking-Groups:
+    - INT_DIV_OR_SQRT
+    - FP_DIV_OR_SQRT
+    - SVE_DIV_OR_SQRT
+  11:
+    Pipelined: True
+    Blocking-Groups:
+    - INT_DIV_OR_SQRT
+    - FP_DIV_OR_SQRT
+    - SVE_DIV_OR_SQRT
 Latencies:
   0:
     Instruction-Groups: 
@@ -216,9 +255,11 @@ Latencies:
     - SCALAR_SIMPLE
     - VECTOR_SIMPLE_LOGICAL
     - SVE_SIMPLE_LOGICAL
+    - STREAMING_SVE_SIMPLE_LOGICAL
     - SME_SIMPLE_LOGICAL
     - VECTOR_SIMPLE_CMP
     - SVE_SIMPLE_CMP
+    - STREAMING_SVE_SIMPLE_CMP
     - SME_SIMPLE_CMP
     Execution-Latency: 4
     Execution-Throughput: 1
@@ -232,21 +273,25 @@ Latencies:
     - SCALAR_SIMPLE_CVT
     - VECTOR_SIMPLE
     - SVE_SIMPLE
+    - STREAMING_SVE_SIMPLE
     - SME_SIMPLE
     - FP_MUL
     - SVE_MUL
+    - STREAMING_SVE_MUL
     - SME_MUL
     Execution-Latency: 9
     Execution-Throughput: 1
   7:
     Instruction-Groups: 
     - SVE_DIV_OR_SQRT
+    - STREAMING_SVE_DIV_OR_SQRT
     - SME_DIV_OR_SQRT
     Execution-Latency: 98
     Execution-Throughput: 98
   8:
     Instruction-Groups: 
     - PREDICATE
+    - STREAMING_PREDICATE
     Execution-Latency: 3
     Execution-Throughput: 1
   9:
@@ -260,8 +305,10 @@ Latencies:
   10:
     Instruction-Groups: 
     - LOAD_SVE
+    - LOAD_STREAMING_SVE
     - LOAD_SME
     - STORE_ADDRESS_SVE
+    - STORE_ADDRESS_STREAMING_SVE
     - STORE_ADDRESS_SME
     Execution-Latency: 6
     Execution-Throughput: 1

diff --git a/docs/sphinx/assets/instruction_groups.png b/docs/sphinx/assets/instruction_groups.png
diff --git a/docs/sphinx/assets/instruction_groups_AArch64.png b/docs/sphinx/assets/instruction_groups_AArch64.png
diff --git a/src/include/simeng/Register.hh b/src/include/simeng/Register.hh
@@ -1,6 +1,5 @@
 #pragma once
 #include <cstdint>
-#include <iostream>
 
 namespace simeng {
 

diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh
@@ -70,6 +70,12 @@ class Architecture : public arch::Architecture {
   /** Returns the current value of SVCRval_. */
   uint64_t getSVCRval() const;
 
+  /** Returns true if SVE Streaming Mode is enabled, false otherwise. */
+  bool isStreamingModeEnabled() const;
+
+  /** Returns true if the SME ZA Register is enabled, false otherwise. */
+  bool isZARegisterEnabled() const;
+
   /** Update the value of SVCRval_. */
   void setSVCRval(const uint64_t newVal) const;
 

diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh
@@ -370,6 +370,12 @@ class Instruction : public simeng::Instruction {
    * processing this instruction. */
   InstructionException getException() const;
 
+  /** Checks whether the current SVE Streaming Mode status is different to when
+   * this instruction was first decoded, and updates the instruction group
+   * accordingly if required.
+   * Returns TRUE if the group was updated, FALSE otherwise. */
+  bool checkStreamingGroupAndUpdate();
+
  private:
   /** Process the instruction's metadata to determine source/destination
    * registers. */
@@ -451,6 +457,9 @@ class Instruction : public simeng::Instruction {
    * the `InsnType` namespace allowing each bit to represent a unique
    * identifier such as `isLoad` or `isMultiply` etc. */
   uint32_t instructionIdentifier_ = 0;
+
+  /** The instruction group this instruction belongs to. */
+  uint16_t instructionGroup_ = InstructionGroups::NONE;
 };
 
 }  // namespace aarch64

diff --git a/src/include/simeng/arch/aarch64/InstructionGroups.hh b/src/include/simeng/arch/aarch64/InstructionGroups.hh
@@ -4,7 +4,33 @@ namespace simeng {
 namespace arch {
 namespace aarch64 {
 
-/** The IDs of the instruction groups for AArch64 instructions. */
+/** The IDs of the instruction groups for AArch64 instructions.
+ * Each new group must contain 14 entries to ensure correct group assignment and
+ * general functionality.
+ * Their order must be as follows:
+ *  - BASE
+ *  - BASE_SIMPLE
+ *  - BASE_SIMPLE_ARTH
+ *  - BASE_SIMPLE_ARTH_NOSHIFT
+ *  - BASE_SIMPLE_LOGICAL
+ *  - BASE_SIMPLE_LOGICAL_NOSHIFT
+ *  - BASE_SIMPLE_CMP
+ *  - BASE_SIMPLE_CVT
+ *  - BASE_MUL
+ *  - BASE_DIV_OR_SQRT
+ *  - LOAD_BASE
+ *  - STORE_ADDRESS_BASE
+ *  - STORE_DATA_BASE
+ *  - STORE_BASE
+ *
+ * An exception to the above is "Parent" groups which do not require the LOAD_*
+ * or STORE_* groups.
+ * "Parent" groups allow for easier grouping of similar groups that may have
+ * identical execution latencies, ports, etc. For example, FP is the parent
+ * group of SCALAR and VECTOR.
+ * In simulation, an instruction's allocated group will never be a "Parent"
+ * group; they are only used to simplify config file creation and management.
+ */
 namespace InstructionGroups {
 const uint16_t INT = 0;
 const uint16_t INT_SIMPLE = 1;
@@ -72,37 +98,53 @@ const uint16_t LOAD_SVE = 62;
 const uint16_t STORE_ADDRESS_SVE = 63;
 const uint16_t STORE_DATA_SVE = 64;
 const uint16_t STORE_SVE = 65;
-const uint16_t PREDICATE = 66;
-const uint16_t LOAD = 67;
-const uint16_t STORE_ADDRESS = 68;
-const uint16_t STORE_DATA = 69;
-const uint16_t STORE = 70;
-const uint16_t BRANCH = 71;
-const uint16_t SME = 72;
-const uint16_t SME_SIMPLE = 73;
-const uint16_t SME_SIMPLE_ARTH = 74;
-const uint16_t SME_SIMPLE_ARTH_NOSHIFT = 75;
-const uint16_t SME_SIMPLE_LOGICAL = 76;
-const uint16_t SME_SIMPLE_LOGICAL_NOSHIFT = 77;
-const uint16_t SME_SIMPLE_CMP = 78;
-const uint16_t SME_SIMPLE_CVT = 79;
-const uint16_t SME_MUL = 80;
-const uint16_t SME_DIV_OR_SQRT = 81;
-const uint16_t LOAD_SME = 82;
-const uint16_t STORE_ADDRESS_SME = 83;
-const uint16_t STORE_DATA_SME = 84;
-const uint16_t STORE_SME = 85;
-const uint16_t ALL = 86;
-const uint16_t NONE = 87;
+const uint16_t STREAMING_SVE = 66;
+const uint16_t STREAMING_SVE_SIMPLE = 67;
+const uint16_t STREAMING_SVE_SIMPLE_ARTH = 68;
+const uint16_t STREAMING_SVE_SIMPLE_ARTH_NOSHIFT = 69;
+const uint16_t STREAMING_SVE_SIMPLE_LOGICAL = 70;
+const uint16_t STREAMING_SVE_SIMPLE_LOGICAL_NOSHIFT = 71;
+const uint16_t STREAMING_SVE_SIMPLE_CMP = 72;
+const uint16_t STREAMING_SVE_SIMPLE_CVT = 73;
+const uint16_t STREAMING_SVE_MUL = 74;
+const uint16_t STREAMING_SVE_DIV_OR_SQRT = 75;
+const uint16_t LOAD_STREAMING_SVE = 76;
+const uint16_t STORE_ADDRESS_STREAMING_SVE = 77;
+const uint16_t STORE_DATA_STREAMING_SVE = 78;
+const uint16_t STORE_STREAMING_SVE = 79;
+const uint16_t SME = 80;
+const uint16_t SME_SIMPLE = 81;
+const uint16_t SME_SIMPLE_ARTH = 82;
+const uint16_t SME_SIMPLE_ARTH_NOSHIFT = 83;
+const uint16_t SME_SIMPLE_LOGICAL = 84;
+const uint16_t SME_SIMPLE_LOGICAL_NOSHIFT = 85;
+const uint16_t SME_SIMPLE_CMP = 86;
+const uint16_t SME_SIMPLE_CVT = 87;
+const uint16_t SME_MUL = 88;
+const uint16_t SME_DIV_OR_SQRT = 89;
+const uint16_t LOAD_SME = 90;
+const uint16_t STORE_ADDRESS_SME = 91;
+const uint16_t STORE_DATA_SME = 92;
+const uint16_t STORE_SME = 93;
+const uint16_t PREDICATE = 94;
+const uint16_t STREAMING_PREDICATE = 95;
+const uint16_t LOAD = 96;
+const uint16_t STORE_ADDRESS = 97;
+const uint16_t STORE_DATA = 98;
+const uint16_t STORE = 99;
+const uint16_t BRANCH = 100;
+const uint16_t ALL = 101;
+const uint16_t NONE = 102;
 }  // namespace InstructionGroups
 
 /** The number of aarch64 instruction groups. */
-static constexpr uint8_t NUM_GROUPS = 88;
+static constexpr uint8_t NUM_GROUPS = 103;
 
 const std::unordered_map<uint16_t, std::vector<uint16_t>> groupInheritance_ = {
     {InstructionGroups::ALL,
      {InstructionGroups::INT, InstructionGroups::FP, InstructionGroups::SVE,
-      InstructionGroups::PREDICATE, InstructionGroups::SME,
+      InstructionGroups::STREAMING_SVE, InstructionGroups::SME,
+      InstructionGroups::PREDICATE, InstructionGroups::STREAMING_PREDICATE,
       InstructionGroups::LOAD, InstructionGroups::STORE,
       InstructionGroups::BRANCH}},
     {InstructionGroups::INT,
@@ -176,6 +218,19 @@ const std::unordered_map<uint16_t, std::vector<uint16_t>> groupInheritance_ = {
      {InstructionGroups::SVE_SIMPLE_ARTH_NOSHIFT}},
     {InstructionGroups::SVE_SIMPLE_LOGICAL,
      {InstructionGroups::SVE_SIMPLE_LOGICAL_NOSHIFT}},
+    {InstructionGroups::STREAMING_SVE,
+     {InstructionGroups::STREAMING_SVE_SIMPLE,
+      InstructionGroups::STREAMING_SVE_DIV_OR_SQRT,
+      InstructionGroups::STREAMING_SVE_MUL}},
+    {InstructionGroups::STREAMING_SVE_SIMPLE,
+     {InstructionGroups::STREAMING_SVE_SIMPLE_ARTH,
+      InstructionGroups::STREAMING_SVE_SIMPLE_LOGICAL,
+      InstructionGroups::STREAMING_SVE_SIMPLE_CMP,
+      InstructionGroups::STREAMING_SVE_SIMPLE_CVT}},
+    {InstructionGroups::STREAMING_SVE_SIMPLE_ARTH,
+     {InstructionGroups::STREAMING_SVE_SIMPLE_ARTH_NOSHIFT}},
+    {InstructionGroups::STREAMING_SVE_SIMPLE_LOGICAL,
+     {InstructionGroups::STREAMING_SVE_SIMPLE_LOGICAL_NOSHIFT}},
     {InstructionGroups::SME,
      {InstructionGroups::SME_SIMPLE, InstructionGroups::SME_DIV_OR_SQRT,
       InstructionGroups::SME_MUL}},
@@ -189,11 +244,11 @@ const std::unordered_map<uint16_t, std::vector<uint16_t>> groupInheritance_ = {
     {InstructionGroups::LOAD,
      {InstructionGroups::LOAD_INT, InstructionGroups::LOAD_SCALAR,
       InstructionGroups::LOAD_VECTOR, InstructionGroups::LOAD_SVE,
-      InstructionGroups::LOAD_SME}},
+      InstructionGroups::LOAD_STREAMING_SVE, InstructionGroups::LOAD_SME}},
     {InstructionGroups::STORE,
      {InstructionGroups::STORE_INT, InstructionGroups::STORE_SCALAR,
       InstructionGroups::STORE_VECTOR, InstructionGroups::STORE_SVE,
-      InstructionGroups::STORE_SME}},
+      InstructionGroups::STORE_STREAMING_SVE, InstructionGroups::STORE_SME}},
     {InstructionGroups::STORE_INT,
      {InstructionGroups::STORE_ADDRESS_INT, InstructionGroups::STORE_DATA_INT}},
     {InstructionGroups::STORE_SCALAR,
@@ -204,17 +259,22 @@ const std::unordered_map<uint16_t, std::vector<uint16_t>> groupInheritance_ = {
       InstructionGroups::STORE_DATA_VECTOR}},
     {InstructionGroups::STORE_SVE,
      {InstructionGroups::STORE_ADDRESS_SVE, InstructionGroups::STORE_DATA_SVE}},
+    {InstructionGroups::STORE_STREAMING_SVE,
+     {InstructionGroups::STORE_ADDRESS_STREAMING_SVE,
+      InstructionGroups::STORE_DATA_STREAMING_SVE}},
     {InstructionGroups::STORE_SME,
      {InstructionGroups::STORE_ADDRESS_SME, InstructionGroups::STORE_DATA_SME}},
     {InstructionGroups::STORE_ADDRESS,
      {InstructionGroups::STORE_ADDRESS_INT,
       InstructionGroups::STORE_ADDRESS_SCALAR,
       InstructionGroups::STORE_ADDRESS_VECTOR,
       InstructionGroups::STORE_ADDRESS_SVE,
+      InstructionGroups::STORE_ADDRESS_STREAMING_SVE,
       InstructionGroups::STORE_ADDRESS_SME}},
     {InstructionGroups::STORE_DATA,
      {InstructionGroups::STORE_DATA_INT, InstructionGroups::STORE_DATA_SCALAR,
       InstructionGroups::STORE_DATA_VECTOR, InstructionGroups::STORE_DATA_SVE,
+      InstructionGroups::STORE_DATA_STREAMING_SVE,
       InstructionGroups::STORE_DATA_SME}}};
 
 }  // namespace aarch64

diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh
@@ -568,9 +568,14 @@ RegisterValue vecUMaxP(srcValContainer& sourceValues) {
   const T* n = sourceValues[0].getAsVector<T>();
   const T* m = sourceValues[1].getAsVector<T>();
 
+  // Concatenate as [n, m]; pointer offsets are in elements, not bytes
+  T temp[2 * I];
+  memcpy(temp, n, sizeof(T) * I);
+  memcpy(temp + I, m, sizeof(T) * I);
+  // Compare each adjacent pair of elements
   T out[I];
   for (int i = 0; i < I; i++) {
-    out[i] = std::max(n[i], m[i]);
+    out[i] = std::max(temp[2 * i], temp[2 * i + 1]);
   }
   return {out, 256};
 }
@@ -585,9 +590,14 @@ RegisterValue vecUMinP(srcValContainer& sourceValues) {
   const T* n = sourceValues[0].getAsVector<T>();
   const T* m = sourceValues[1].getAsVector<T>();
 
+  // Concatenate as [n, m]; pointer offsets are in elements, not bytes
+  T temp[2 * I];
+  memcpy(temp, n, sizeof(T) * I);
+  memcpy(temp + I, m, sizeof(T) * I);
+  // Compare each adjacent pair of elements
   T out[I];
   for (int i = 0; i < I; i++) {
-    out[i] = std::min(n[i], m[i]);
+    out[i] = std::min(temp[2 * i], temp[2 * i + 1]);
   }
   return {out, 256};
 }

diff --git a/src/include/simeng/arch/riscv/Instruction.hh b/src/include/simeng/arch/riscv/Instruction.hh
@@ -252,6 +252,9 @@ class Instruction : public simeng::Instruction {
    * the `InsnType` namespace allowing each bit to represent a unique
    * identifier such as `isLoad` or `isMultiply` etc. */
   uint16_t instructionIdentifier_ = 0;
+
+  /** The instruction group this instruction belongs to. */
+  uint16_t instructionGroup_ = InstructionGroups::NONE;
 };
 
 }  // namespace riscv