Fixed CCT on L3 Bugs

pulp-platform · Feb 23, 2025 · 3df3245 · 3df3245
1 parent 026210f
commit 3df3245
Show file tree

Hide file tree

Showing 32 changed files with 100 additions and 29 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -74,7 +74,7 @@ jobs:
         ICCT_ITA_8
         miniMobileNet
         miniMobileNetv2
-        CCT/CCT_16_16_8
+        CCT/CCT_1_16_16_8
   
 
   ### CortexM Tests ###
@@ -249,6 +249,7 @@ jobs:
         MLPerf/KeywordSpotting
         MLPerf/ImageClassification
         MLPerf/AnomalyDetection
+        CCT/CCT_1_16_16_8
       num-cores: 8
 
   siracusa-kernels-tiled-singlebuffer-L2:
@@ -429,7 +430,7 @@ jobs:
             L1: [64000]
           - name: "MLPerf/AnomalyDetection"
             L1: [64000]
-          - name: "CCT/CCT_16_16_8"
+          - name: "CCT/CCT_1_16_16_8"
             L1: [64000]
         num-cores:
           - 8
@@ -456,6 +457,8 @@ jobs:
             L1: [60000, 30000, 15000]
           - name: "microLlama/microLlama1"
             L1: [60000, 10000, 5000]
+          - name: "CCT/CCT_1_32_32_8"
+            L1: [64000]
         num-cores:
           - 8
         default-memory-level:
@@ -488,6 +491,8 @@ jobs:
             L1: [60000, 20000, 10000]
           - name: "microLlama/microLlama8_parallel"
             L1: [60000, 20000, 10000]
+          - name: "CCT/CCT_1_32_32_8"
+            L1: [64000]
         num-cores:
           - 8
         double-buffer:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -136,8 +136,23 @@ Change main.c to use OUTPUTTYPE instead of float
 - CCT onnx tests with img size of 16 and 32
 
 ### Fixed
-- CycleMeasure Pass for Siracusa Untiling Profilling
+- CycleMeasure Pass for Siracusa Untiled Profiling
 - GEMM Tiling Constraints `transA` and `transB` not supported
 - MatMul layer Multi-Dimensional Input Issue
 - Add Layer for Broadcasted Bias
 - Resolved an issue where concatenation of float32 with f caused inf errors during code generation
+
+## Fix Float CCT Bugs on L3
+
+### Added
+- Added multiple CCT settings for testing.
+- Added CCT L3 test to CI to ensure correctness for img size of 16 and 32.
+- Added NaN check for deeploytest diff to improve result validation.
+
+### Changed
+- Regenerated CCT ONNX files without "output" & "input" in their names to avoid triggering the dumphex parser bug.
+- Regenerated CCT ONNX file so that the attention computation graph is split into three branches.
+- Changed code generation for Hex output to properly handle float values.
+
+### Fixed
+- Updated printinput nodetemplate for float handling.
diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py b/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py
@@ -40,13 +40,16 @@
     accessStr += "[" + f"print_iter_{idx}" + "]"
     if idx > 0:
         dimStr += "[" + f"{dim}" + "]"
+formatSpecifier = "%*i" 
+if "float" in bufferType.referencedType.typeName or "double" in bufferType.referencedType.typeName:
+    formatSpecifier = "%*.6f"  
 %>
 printf("${nodeName} ${bufferName}: ${bufferType.referencedType.typeName}, ${bufferShape}, %p\\n", ${bufferName});
 % for idx, dim in enumerate(bufferShape):
 printf("[");
 for (int print_iter_${idx}=0; print_iter_${idx} < ${dim}; print_iter_${idx}++){
 % endfor
-printf("%*i,", 4, ((${bufferType.referencedType.typeName} (*)${dimStr})${bufferName})${accessStr});
+printf("${formatSpecifier},", 4, ((${bufferType.referencedType.typeName} (*)${dimStr})${bufferName})${accessStr});
 % for dim in bufferShape:
 }
 printf("], \\n");
@@ -214,8 +217,11 @@ def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
 
 class MemoryAwarePrintConstantGeneration(MemoryAwareGeneration, PrintConstantGeneration):
 
-    def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
-              name: str) -> Tuple[NetworkContext, ExecutionBlock]:
+    def apply(self,
+              ctxt: NetworkContext,
+              executionBlock: ExecutionBlock,
+              name: str,
+              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
 
         references = self.extractDynamicReferences(ctxt, executionBlock, True)
 

diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py
@@ -70,7 +70,8 @@ class CodeGenVerbosity:
     """
 
     tilingProfiling: Optional[str]  #: str: Specifies the name of the memory level on which to profile tiling
-    untiledProfiling: Optional[bool] = None  #: str: Specifies the name of the memory level on which to profile untiling
+    untiledProfiling: Optional[
+        bool] = None  #: bool: Specifies whether to profile the untiled code
 
 
 _NoVerbosity = CodeGenVerbosity(None)

diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -38,19 +38,20 @@
 from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
 from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
 from Deeploy.Targets.Generic.Templates import ConcatTemplate, FloatGELUTemplate, FloatGemmTemplate, \
-    FloatLayernormTemplate, FloatMatMulTemplate, FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, \
-    GatherTemplate, RQSiGELUTemplate, iHardswishTemplate
+    FloatLayernormTemplate, FloatMatMulTemplate, FloatMulTemplate, FloatReluTemplate, GatherTemplate, \
+    RQSiGELUTemplate, iHardswishTemplate
 from Deeploy.Targets.Generic.TypeCheckers import ConcatChecker, ConvChecker, GatherChecker, GELUChecker, GEMMChecker, \
     HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, ReduceMeanChecker, ReluChecker, RQAddChecker, \
     RQHardswishChecker, SliceChecker, SoftmaxChecker, TransposeChecker
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
+from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled
 from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture
-from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatConvTemplate, FloatMaxPoolTemplate, GEMMTemplate, \
-    MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, RQAddTemplate, \
-    RQSiHardswishTemplate, SliceTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \
-    iRMSNormTemplate, iSoftmaxTemplate
+from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatConvTemplate, FloatMaxPoolTemplate, \
+    FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, \
+    RequantShiftTemplate, RQAddTemplate, RQSiHardswishTemplate, SliceTemplate, TallGEMMTemplate, TransposeTemplate, \
+    UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate
 from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \
     PULPRequantShiftChecker
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement
@@ -118,6 +119,7 @@
     MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
     TilingVariableReplacement("L2"),
     PULPL3Tiling("L2"),
+    PULPProfileUntiled(),
     ArgumentStructGeneration(),
     L3MemoryAwareFunctionCallClosure(writeback = False),
     MemoryManagementGeneration("L3.*"),
@@ -134,6 +136,7 @@
     MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
     TilingVariableReplacement("L2"),
     PULPL3Tiling("L2"),
+    PULPProfileUntiled(),
     ArgumentStructGeneration(),
     L3MemoryAwareFunctionCallClosure(writeback = False),
     MemoryManagementGeneration("L2"),

diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
@@ -25,7 +25,6 @@
 
 from typing import Tuple
 
-from Deeploy.CommonExtensions.CodeTransformationPasses.CycleMeasurement import ProfilingCodeGeneration
 from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
 
 from .PULPClusterTilingDB import ProfilingPULPClusterTilingGenerationDB, PULPClusterTilingGenerationDB
@@ -39,7 +38,6 @@ def __init__(self, targetMemLevel: str):
         self.profilingSB = ProfilingPULPClusterTilingGenerationSB(targetMemLevel)
         self.DB = PULPClusterTilingGenerationDB(targetMemLevel)
         self.profilingDB = ProfilingPULPClusterTilingGenerationDB(targetMemLevel)
-        self.profiluntiling = ProfilingCodeGeneration()
 
     def apply(self,
               ctxt: NetworkContext,
@@ -54,7 +52,4 @@ def apply(self,
             ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
             ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)
 
-        if verbose.untilingProfiling:
-            ctxt, executionBlock = self.profiluntiling.apply(ctxt, executionBlock, name)
-
         return ctxt, executionBlock
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py
@@ -0,0 +1,34 @@
+# ----------------------------------------------------------------------
+#
+# File: FloatSoftmaxTemplate.py
+#
+# Last edited: 23.1.2025
+#
+# Copyright (C) 2021, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// Softmax (Name: ${nodeName}, Op: ${nodeOp})
+int8_t ${nodeName}_core_id = pi_core_id();
+if (${nodeName}_core_id == 0) {
+    Softmax_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${lastDimLength});
+}
+""")
diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytest.c b/DeeployTest/Platforms/Siracusa/src/deeploytest.c
@@ -114,7 +114,7 @@ void main(void) {
       diff = expected - actual;
       if (ISOUTPUTFLOAT)
       {
-        if ((diff < -1e-4) || (diff > 1e-4))
+        if ((diff < -1e-4) || (diff > 1e-4) || (isnan(diff)))
         {
           tot_err += 1;
           printf("Expected: %10.6f  ", expected);

diff --git a/DeeployTest/Tests/CCT/CCT_16_16_8/inputs.npz b/DeeployTest/Tests/CCT/CCT_16_16_8/inputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_16_16_8/network.onnx b/DeeployTest/Tests/CCT/CCT_16_16_8/network.onnx
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_128/inputs.npz b/DeeployTest/Tests/CCT/CCT_1_16_16_128/inputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_128/network.onnx b/DeeployTest/Tests/CCT/CCT_1_16_16_128/network.onnx
diff --git a/...loyTest/Tests/CCT/CCT_32_32_8/outputs.npz → ...est/Tests/CCT/CCT_1_16_16_128/outputs.npz b/...loyTest/Tests/CCT/CCT_32_32_8/outputs.npz → ...est/Tests/CCT/CCT_1_16_16_128/outputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_32/inputs.npz b/DeeployTest/Tests/CCT/CCT_1_16_16_32/inputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_32/network.onnx b/DeeployTest/Tests/CCT/CCT_1_16_16_32/network.onnx
diff --git a/...loyTest/Tests/CCT/CCT_16_16_8/outputs.npz → ...Test/Tests/CCT/CCT_1_16_16_32/outputs.npz b/...loyTest/Tests/CCT/CCT_16_16_8/outputs.npz → ...Test/Tests/CCT/CCT_1_16_16_32/outputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_64/inputs.npz b/DeeployTest/Tests/CCT/CCT_1_16_16_64/inputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_64/network.onnx b/DeeployTest/Tests/CCT/CCT_1_16_16_64/network.onnx
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_64/outputs.npz b/DeeployTest/Tests/CCT/CCT_1_16_16_64/outputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_8/inputs.npz b/DeeployTest/Tests/CCT/CCT_1_16_16_8/inputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_8/network.onnx b/DeeployTest/Tests/CCT/CCT_1_16_16_8/network.onnx
diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_8/outputs.npz b/DeeployTest/Tests/CCT/CCT_1_16_16_8/outputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_32/inputs.npz b/DeeployTest/Tests/CCT/CCT_1_32_32_32/inputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_32/network.onnx b/DeeployTest/Tests/CCT/CCT_1_32_32_32/network.onnx
diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_32/outputs.npz b/DeeployTest/Tests/CCT/CCT_1_32_32_32/outputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_8/inputs.npz b/DeeployTest/Tests/CCT/CCT_1_32_32_8/inputs.npz
diff --git a/...oyTest/Tests/CCT/CCT_32_32_8/network.onnx → ...Test/Tests/CCT/CCT_1_32_32_8/network.onnx b/...oyTest/Tests/CCT/CCT_32_32_8/network.onnx → ...Test/Tests/CCT/CCT_1_32_32_8/network.onnx
diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_8/outputs.npz b/DeeployTest/Tests/CCT/CCT_1_32_32_8/outputs.npz
diff --git a/DeeployTest/Tests/CCT/CCT_32_32_8/inputs.npz b/DeeployTest/Tests/CCT/CCT_32_32_8/inputs.npz
diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py
@@ -55,6 +55,11 @@
     parser.add_argument('--overwriteRecentState',
                         action = 'store_true',
                         help = 'Copy the recent deeply state to the ./deeployStates folder\n')
+    parser.add_argument('--profileUntiled',
+                        action = 'store_true',
+                        dest = 'profileUntiled',
+                        default = False,
+                        help = 'Profile Untiled for L2\n')
 
     args = parser.parse_args()
 
@@ -105,6 +110,10 @@
     ) and not "simpleCNN" in args.dir and not "testRQMatMul" in args.dir and not "testRQGEMM" in args.dir:
         deployer.loweringOptimizer.passes.insert(0, EmulateCMSISRequantPass())
 
+    verbosityCfg = _NoVerbosity
+    if isinstance(platform, PULPPlatform):
+        verbosityCfg.untiledProfiling = args.profileUntiled
+
     # Parse graph and infer output levels and signedness
     _ = deployer.generateFunction(verbose = verbosityCfg)
 
@@ -149,4 +158,4 @@
         print("=" * 80)
         print()
         print(f"{'Number of Ops:' :<{_TEXT_ALIGN}} {num_ops}")
-        print(f"{'Model Parameters: ' :<{_TEXT_ALIGN}} {deployer.getParameterSize()}")
+        print(f"{'Model Parameters: ' :<{_TEXT_ALIGN}} {deployer.getParameterSize()}")
diff --git a/DeeployTest/testUtils/codeGenerate.py b/DeeployTest/testUtils/codeGenerate.py
@@ -249,17 +249,21 @@ def generateTestNetworkImplementation(deployer: NetworkDeployer,
 def generateL3HexDump(deployer: NetworkDeployer, path: str, test_inputs: List, test_outputs: List):
 
     def type2TypeStr(dataType) -> Tuple[str, int]:
-        width = dataType.referencedType.typeWidth
-        signed = (dataType.referencedType.typeMin < 0)
+        if dataType.referencedType.typeName == "float32_t":
+            retStr = "float32"
+            width = 32
+        else:
+            width = dataType.referencedType.typeWidth
+            signed = (dataType.referencedType.typeMin < 0)
 
-        retStr = ""
+            retStr = ""
 
-        if signed:
-            retStr += "int"
-        else:
-            retStr += "uint"
+            if signed:
+                retStr += "int"
+            else:
+                retStr += "uint"
 
-        retStr += str(width)
+            retStr += str(width)
 
         return retStr, width
 

diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py
@@ -305,7 +305,6 @@ def generate_test(self):
 
         command = f"python {generation_script} -d {self._dir_gen} -t {self._dir_test} -p {self._platform} {self.gen_args}"
         command += self._argument_parser.generate_cmd_args()
-        print(command)
 
         if self._args.verbose >= 2:
             prBlue(f"[TestRunner] Generation Command: {command}")