Add Fp gemm and Softmax for Snitch platform (#31)
* add fp gemm and softmax

* add new test for gemm_fp32_transb

* add new test in CI.yml

* increase size of the error between expected and actual

* update CHANGELOG
tahaelbayad authored Feb 11, 2025
1 parent 6758edc commit feff1ef
Showing 17 changed files with 477 additions and 22 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/CI.yml
@@ -115,6 +115,7 @@ jobs:
testRQGEMM
TestRQAdd
testRQGEMMTransB
testFloatSoftmax
num-cores: 9
simulators: |
banshee
@@ -141,9 +142,24 @@
"name": "testRQGEMM",
"L1": [2000, 5000]
},
{
"name": "testFloatSoftmax",
"L1": [2000, 5000, 10000]
},
{
"name": "TestRQAdd",
"L1": [5000, 10000]
},
{
"name": "testFloatGEMM",
"L1": [2000, 5000, 10000]
},
{
"name": "testFloatGEMMtransB",
"L1": [2000, 5000, 10000]
}
]
simulators: |
15 changes: 14 additions & 1 deletion CHANGELOG.md
@@ -112,4 +112,17 @@ Change main.c to use OUTPUTTYPE instead of float

### Fixed
- MaxPool Padding Extract Pass for float and integer
- Testinput, testoutput, weight type casted from double to float warning
- Testinput, testoutput, weight type casted from double to float warning

## Add Float GEMM and Softmax for Snitch platform

### Added
- New templates for GEMM and Softmax.
- Added GEMM and Softmax to TargetLibraries, including case for GEMM with a transposed B matrix.
- Added new CI tests for GEMM and Softmax.

### Changed
- Adapted Snitch Bindings and Platform files.

### Fixed
- Relaxed the error threshold between expected and actual values in deeploytest.
3 changes: 2 additions & 1 deletion CONTRIBUTORS.md
@@ -6,4 +6,5 @@ All contributors have agreed to an open-source release of their work in the Deep
* Luka Macan
* Alberto Dequino
* Francesco Conti
* Run Wang
* Run Wang
* Taha El Bayad
1 change: 0 additions & 1 deletion Deeploy/Targets/Generic/Parsers.py
@@ -549,7 +549,6 @@ def __init__(self):
def parseNode(self, node: gs.Node) -> bool:

ret = all([len(node.inputs) == 1, len(node.outputs) == 1])

return ret

def parseNodeCtxt(self,
14 changes: 12 additions & 2 deletions Deeploy/Targets/Snitch/Bindings.py
@@ -29,14 +29,15 @@
from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration
from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
MemoryManagementGeneration
from Deeploy.CommonExtensions.DataTypes import int8_t, int32_t, uint8_t
from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.Generic.Templates import iNoNormTemplate
from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker
from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \
SnitchProfileExecutionBlockPass, SnitchSynchCoresPass
from Deeploy.Targets.Snitch.Templates import AddTemplate, RQAddTemplate, iSoftmaxTemplate
from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate
from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template
from Deeploy.Targets.Snitch.Templates.GemmTemplate import SnitchGemm_Template
from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template
from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement
@@ -69,7 +70,11 @@
SnitchiSoftmaxBindings = [
NodeBinding(SoftmaxChecker([PointerClass(_type)], [PointerClass(uint8_t)]), iSoftmaxTemplate.referenceTemplate,
TiledTransformer) for _type in [int8_t, uint8_t]
] + [
NodeBinding(SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatSoftmax_Template,
TiledTransformer)
]

SnitchiNoNormBindings = [
NodeBinding(
iNoNormChecker([PointerClass(_type), PointerClass(int8_t),
@@ -88,6 +93,11 @@
NodeBinding(
GEMMChecker([PointerClass(int8_t), PointerClass(int8_t),
PointerClass(int32_t)], [PointerClass(int32_t)]), SnitchGemm_Template, TiledTransformer)
] + [
NodeBinding(
GEMMChecker([PointerClass(float32_t), PointerClass(float32_t),
PointerClass(float32_t)], [PointerClass(float32_t)]), FloatGemmTemplate.referenceTemplate,
TiledTransformer)
]
SnitchRqGemmBindings = [
NodeBinding(
4 changes: 3 additions & 1 deletion Deeploy/Targets/Snitch/Platform.py
@@ -35,7 +35,7 @@
from Deeploy.Targets.Generic.Layers import AddLayer, GatherLayer, GEMMLayer, LayerNormLayer, MatMulLayer, PadLayer, \
ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer, iNoNormLayer
from Deeploy.Targets.Generic.Parsers import AddParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \
RQAddParser, RQIntegerDivParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser
RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser
from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import AddRequantMergePass, GEMMRequantMergePass, \
IntegerDivRequantMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, \
@@ -58,6 +58,7 @@
GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings)
RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings)
iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings)
SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings)
iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings)
iLayerNormMapper = NodeMapper(iLayerNormParser(), BasicLayerNormBindings)
RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings)
@@ -72,6 +73,7 @@
'Gemm': GEMMLayer([GemmMapper]),
'RQGemm': RQGEMMLayer([RqGemmMapper]),
'iSoftmax': SoftmaxLayer([iSoftmaxMapper]),
'Softmax': SoftmaxLayer([SoftmaxMapper]),
'iNoNorm': iNoNormLayer([iNoNormMapper]),
'iLayerNorm': LayerNormLayer([iLayerNormMapper]),
'RequantizedAdd': AddLayer([RQAddMapper]),
11 changes: 11 additions & 0 deletions Deeploy/Targets/Snitch/Templates/FloatGemmTemplate.py
@@ -0,0 +1,11 @@
from Deeploy.DeeployTypes import NodeTemplate

referenceTemplate = NodeTemplate("""
uint32_t compute_num = snrt_cluster_compute_core_num();
% if transB:
gemm_fp32_transB_opt(${M} / compute_num, ${O}, ${N}, ${A}, ${N} * compute_num, ${B}, ${N}, ${C}, ${O} * compute_num, ${data_out}, 1, 1 );
% else:
gemm_fp32_opt(${M} / compute_num, ${O}, ${N}, ${A}, ${N} * compute_num, ${B}, ${O}, ${C}, ${O} * compute_num, ${data_out}, 1, 1 );
%endif
""")
57 changes: 57 additions & 0 deletions Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py
@@ -0,0 +1,57 @@
# ----------------------------------------------------------------------
#
# File: FloatSoftmaxTemplate.py
#
# Last edited: 30.05.2024
#
# Copyright (C) 2024, ETH Zurich and University of Bologna.
#
# Author:
# - Victor Jung, [email protected], ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class FloatSoftmaxTemplate(NodeTemplate):

def __init__(self, templateStr):
super().__init__(templateStr)

def alignToContext(self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:

data_in = ctxt.lookup(operatorRepresentation["data_in"])
operatorRepresentation["seq_len"] = data_in.shape[2]
operatorRepresentation["input_samples"] = data_in.shape[-1]

operatorRepresentation["kernelName"] = "Softmax_fp32"

return ctxt, operatorRepresentation, []


FloatSoftmaxTemplateStr = r"""
uint32_t batch_size = ${size} / ${lastDimLength};
uint32_t compute_num = snrt_cluster_compute_core_num();
int32_t ldI = compute_num * ${input_samples};
int32_t batch_offset = ${seq_len} * ${input_samples};
${kernelName}(${data_in}, ${data_out}, ldI, batch_offset, batch_size, ${seq_len}, ${input_samples});
"""

FloatSoftmax_Template = FloatSoftmaxTemplate(FloatSoftmaxTemplateStr)
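As a sketch, assuming an input of shape (1, 8, 16, 32) and placeholder buffer names, the template above would expand roughly as follows; seq_len comes from shape[2] and input_samples from the last dimension, as derived in alignToContext.

```c
// Hypothetical expansion for size = 4096, lastDimLength = 32,
// seq_len = 16, input_samples = 32; in_buf/out_buf are placeholders.
uint32_t batch_size = 4096 / 32;      // 128 softmax rows in total
uint32_t compute_num = snrt_cluster_compute_core_num();
int32_t ldI = compute_num * 32;       // row stride per compute core
int32_t batch_offset = 16 * 32;       // seq_len * input_samples elements per batch
Softmax_fp32(in_buf, out_buf, ldI, batch_offset, batch_size, 16, 32);
```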
2 changes: 1 addition & 1 deletion DeeployTest/Platforms/Siracusa/src/deeploytest.c
@@ -141,4 +141,4 @@ void main(void) {

printf("Runtime: %u cycles\r\n", getCycles());
printf("Errors: %u out of %u \r\n", tot_err, tot_tested);
}
}
53 changes: 38 additions & 15 deletions DeeployTest/Platforms/Snitch/main.c
@@ -132,23 +132,46 @@ int main(void) {
#ifndef NOTEST
int32_t tot_err = 0;
uint32_t tot = 0;
int32_t diff;
int32_t expected, actual;
for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {

tot += DeeployNetwork_outputs_bytes[buf];
for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) {
expected = ((char *)testOutputVector[buf])[i];
actual = ((char *)DeeployNetwork_outputs[buf])[i];
diff = expected - actual;

if (diff) {
tot_err += 1;
if (ISOUTPUTFLOAT) {
float32_t diff;
float32_t expected, actual;
for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {

tot += DeeployNetwork_outputs_bytes[buf] / sizeof(float32_t);
for (uint32_t i = 0;
i < DeeployNetwork_outputs_bytes[buf] / sizeof(float32_t); i++) {
expected = ((float32_t *)testOutputVector[buf])[i];
actual = ((float32_t *)DeeployNetwork_outputs[buf])[i];
diff = expected - actual;

if (diff < -1.2e-5 || diff > 1.2e-5) {
tot_err += 1;
#ifndef CI
printf("Expected: %f ", expected);
printf("Actual: %f ", actual);
printf("Diff: %f at Index %12u in Output %u\r\n", diff, i, buf);
#endif
}
}
}
} else {
int32_t diff;
int32_t expected, actual;
for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {

tot += DeeployNetwork_outputs_bytes[buf];
for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) {
expected = ((char *)testOutputVector[buf])[i];
actual = ((char *)DeeployNetwork_outputs[buf])[i];
diff = expected - actual;
if (diff) {
tot_err += 1;
#ifndef CI
printf("Expected: %4d ", expected);
printf("Actual: %4d ", actual);
printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
printf("Expected: %4d ", expected);
printf("Actual: %4d ", actual);
printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
#endif
}
}
}
}
Binary file added DeeployTest/Tests/testFloatGEMMtransB/inputs.npz
Binary file not shown.
Binary file not shown.
Binary file added DeeployTest/Tests/testFloatGEMMtransB/outputs.npz
Binary file not shown.
51 changes: 51 additions & 0 deletions TargetLibraries/Snitch/inc/kernel/Gemm_fp32.h
@@ -0,0 +1,51 @@
#ifndef __DEEPLOY_MATH_GEMM_KERNEL_HEADER_
#define __DEEPLOY_MATH_GEMM_KERNEL_HEADER_

#include "DeeploySnitchMath.h"

/*
* TILING ONLY due to the SSR loop
*
*
*
* FP32 GEMM with the following format:
* A is an M x K matrix, B is a K x N matrix, and C is an M x N matrix
*
* A' = transpose(A) if transA else A
* B' = transpose(B) if transB else B
*
* Y = A' * B' + C
*
*/

/*
*
* transposed A = no
* transposed B = yes
* multi-core = yes
* unrolling = yes
* simd = yes
* parallelization = row-wise
*/

void gemm_fp32_transB_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A,
uint32_t ldA, float32_t *B, uint32_t ldB,
float32_t *C, uint32_t ldC, float32_t *Y,
uint32_t BETA, uint32_t setup_SSR);

/*
*
* transposed A = no
* transposed B = no
* multi-core = yes
* unrolling = yes
* simd = yes
* parallelization = row-wise
*/

void gemm_fp32_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A,
uint32_t ldA, float32_t *B, uint32_t ldB, float32_t *C,
uint32_t ldC, float32_t *Y, uint32_t BETA,
uint32_t setup_SSR);

#endif //__DEEPLOY_MATH_GEMM_KERNEL_HEADER_
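The optimized kernels declared above depend on the Snitch SSR streams and a multi-core row split, so they are not reproduced here. The following is only a scalar reference sketch of the contract the header documents (Y = A * B' + C), under the assumptions that storage is row-major, that BETA selects whether C is accumulated, that Y shares C's leading dimension, and that float32_t aliases float.

```c
#include <stdint.h>

typedef float float32_t;  // assumption: matches the DeeploySnitchMath.h typedef

// Scalar reference of Y = A * B' + C for an M x K A, K x N B (or N x K
// when transB is set) and M x N C/Y, all row-major; C is added only when
// BETA is non-zero. Not the optimized kernel; it only documents the
// assumed argument semantics.
static void gemm_fp32_reference(uint32_t M, uint32_t N, uint32_t K,
                                const float32_t *A, uint32_t ldA,
                                const float32_t *B, uint32_t ldB,
                                const float32_t *C, uint32_t ldC,
                                float32_t *Y, uint32_t BETA, int transB) {
  for (uint32_t m = 0; m < M; ++m) {
    for (uint32_t n = 0; n < N; ++n) {
      float32_t acc = BETA ? C[m * ldC + n] : 0.0f;
      for (uint32_t k = 0; k < K; ++k) {
        float32_t b = transB ? B[n * ldB + k] : B[k * ldB + n];
        acc += A[m * ldA + k] * b;
      }
      Y[m * ldC + n] = acc;  // assumption: Y uses the same leading dimension as C
    }
  }
}
```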
5 changes: 5 additions & 0 deletions TargetLibraries/Snitch/inc/kernel/Softmax.h
@@ -0,0 +1,5 @@
#include "DeeploySnitchMath.h"

void softmax_fp32(float *input, float *output, int32_t ldI,
int32_t batch_offset, int32_t batch_size, int32_t seq_len,
int32_t input_samples);
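The ldI and batch_offset arguments encode the multi-core striding used on Snitch and are not modeled below; this is only a minimal, single-row reference of the numerically stable softmax the kernel applies over input_samples elements.

```c
#include <math.h>
#include <stdint.h>

// Reference softmax over one row: subtract the row maximum before
// exponentiating to avoid overflow, then normalize by the sum.
static void softmax_row_reference(const float *row_in, float *row_out,
                                  int32_t input_samples) {
  float max_val = row_in[0];
  for (int32_t i = 1; i < input_samples; ++i)
    if (row_in[i] > max_val)
      max_val = row_in[i];

  float sum = 0.0f;
  for (int32_t i = 0; i < input_samples; ++i) {
    row_out[i] = expf(row_in[i] - max_val);
    sum += row_out[i];
  }
  for (int32_t i = 0; i < input_samples; ++i)
    row_out[i] /= sum;
}
```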