From 85e7777d4eee25cbeac9c8c8654022480992e8f1 Mon Sep 17 00:00:00 2001 From: "Desmond A. Kirkpatrick" Date: Thu, 6 Feb 2025 13:54:07 -0800 Subject: [PATCH 1/9] good tests for narrower m,n and overflow --- lib/src/arithmetic/float_to_fixed.dart | 90 +++++++-- .../arithmetic/signals/fixed_point_logic.dart | 4 + .../arithmetic/values/fixed_point_value.dart | 22 ++- test/arithmetic/fixed_to_float_test.dart | 28 +++ test/arithmetic/float_to_fixed_test.dart | 177 ++++++++++++++++++ .../values/fixed_point_value_test.dart | 26 +++ 6 files changed, 328 insertions(+), 19 deletions(-) diff --git a/lib/src/arithmetic/float_to_fixed.dart b/lib/src/arithmetic/float_to_fixed.dart index 43cfe7fa7..9967dd19e 100644 --- a/lib/src/arithmetic/float_to_fixed.dart +++ b/lib/src/arithmetic/float_to_fixed.dart @@ -7,6 +7,8 @@ // 2024 November 1 // Author: Soner Yaldiz +import 'dart:math'; + import 'package:rohd/rohd.dart'; import 'package:rohd_hcl/rohd_hcl.dart'; @@ -24,14 +26,24 @@ class FloatToFixed extends Module { /// Width of output fractional part. late final int n; + /// Return true if the conversion overflowed. + Logic? get overflow => tryOutput('overflow'); + /// Internal representation of the output port late final FixedPoint _fixed = FixedPoint(signed: true, m: m, n: n); /// Output fixed point port late final FixedPoint fixed = _fixed.clone()..gets(output('fixed')); - /// Constructor - FloatToFixed(FloatingPoint float, {super.name = 'FloatToFixed'}) + /// Build a [FloatingPoint] to [FixedPoint] converter. + /// - if [m] and [n] are supplied, an m.n fixed-point output will be produced. + /// Otherwise, the converter will compute a lossless size for [m] and [n] for + /// outputing the floating-point value into a fixed-point value. + /// - [checkOverflow] set to true will cause overflow detection to happen in + /// case that loss can occur and an optional output [overflow] will be + /// produced that returns true when overflow occurs. 
+ FloatToFixed(FloatingPoint float, + {super.name = 'FloatToFixed', int? m, int? n, bool checkOverflow = false}) : super( definitionName: 'FloatE${float.exponent.width}' 'M${float.mantissa.width}ToFixed') { @@ -39,24 +51,66 @@ class FloatToFixed extends Module { final bias = FloatingPointValue.computeBias(float.exponent.width); // E4M3 expands the max exponent by 1. - m = ((float.exponent.width == 4) & (float.mantissa.width == 3)) - ? bias + 1 - : bias; - n = bias + float.mantissa.width - 1; - final outputWidth = m + n + 1; + final noLossM = ((float.exponent.width == 4) & (float.mantissa.width == 3)) + ? bias + 2 + : bias + 1; // accomodate the jbit + final noLossN = bias + float.mantissa.width - 1; + + this.m = m ?? noLossM; + this.n = n ?? noLossN; + final outputWidth = this.m + this.n + 1; final jBit = Logic(name: 'jBit')..gets(float.isNormal); - final shift = Logic(name: 'shift', width: float.exponent.width) - ..gets( - mux(jBit, float.exponent - 1, Const(0, width: float.exponent.width))); - - final number = Logic(name: 'number', width: outputWidth) - ..gets([ - Const(0, width: outputWidth - float.mantissa.width - 1), - jBit, - float.mantissa - ].swizzle() << - shift); + final fullMantissa = [jBit, float.mantissa].swizzle().named('fullMantissa'); + + final eWidth = max(log2Ceil(this.n + this.m), float.exponent.width) + 1; + final shift = Logic(name: 'shift', width: eWidth); + final exp = (float.exponent - 1).zeroExtend(eWidth); + + if (this.n > noLossN) { + shift <= + mux(jBit, exp, Const(0, width: eWidth)) + + Const(this.n - noLossN, width: eWidth); + } else if (this.n == noLossN) { + shift <= mux(jBit, exp, Const(0, width: eWidth)); + } else { + shift <= + mux(jBit, exp, Const(0, width: eWidth)) - + Const(noLossN - this.n, width: eWidth); + } + + if (checkOverflow & ((this.m < noLossM) | (this.n < noLossN))) { + final overFlow = Logic(name: 'overflow'); + final leadDetect = ParallelPrefixPriorityEncoder(fullMantissa.reversed); + + final sWidth = max(eWidth, 
leadDetect.out.width); + final fShift = shift.zeroExtend(sWidth); + final leadOne = leadDetect.out.zeroExtend(sWidth); + + Combinational([ + If(jBit, then: [ + overFlow < shift.gte(outputWidth - float.mantissa.width - 1), + ], orElse: [ + If(fShift.gt(leadOne), then: [ + overFlow < + (fShift - leadOne).gte(outputWidth - float.mantissa.width - 1), + ], orElse: [ + overFlow < Const(0), + ]), + ]), + ]); + addOutput('overflow') <= overFlow; + } + final preNumber = (outputWidth >= fullMantissa.width) + ? fullMantissa.zeroExtend(outputWidth) + : fullMantissa.slice(-1, fullMantissa.width - outputWidth); + // TODO(desmonddak): Rounder is needed when shift is negative, + // LSB(fullMantissa) = shiftRight + final shiftRight = ((fullMantissa.width > outputWidth) + ? (~shift + 1) - (fullMantissa.width - outputWidth) + : (~shift + 1)); + + final number = mux(shift[-1], preNumber >>> shiftRight, preNumber << shift); _fixed <= mux(float.sign, ~number + 1, number); addOutput('fixed', width: outputWidth) <= _fixed; diff --git a/lib/src/arithmetic/signals/fixed_point_logic.dart b/lib/src/arithmetic/signals/fixed_point_logic.dart index cf503b8cb..eea4d52f3 100644 --- a/lib/src/arithmetic/signals/fixed_point_logic.dart +++ b/lib/src/arithmetic/signals/fixed_point_logic.dart @@ -42,6 +42,10 @@ class FixedPoint extends Logic { } } + /// Retrieve the [FixedPointValue] of this [FixedPoint] logical signal. + FixedPointValue get fixedPointValue => + FixedPointValue(value: value, signed: signed, m: m, n: n); + /// Clone for I/O ports. @override FixedPoint clone({String? 
name}) => FixedPoint(signed: signed, m: m, n: n); diff --git a/lib/src/arithmetic/values/fixed_point_value.dart b/lib/src/arithmetic/values/fixed_point_value.dart index f4a22e4fe..2b64e8dcd 100644 --- a/lib/src/arithmetic/values/fixed_point_value.dart +++ b/lib/src/arithmetic/values/fixed_point_value.dart @@ -143,7 +143,15 @@ class FixedPointValue implements Comparable { return compareTo(other) == 0; } - /// Constructs [FixedPointValue] of a Dart [double] rounding away from zero. + /// Return a string representation of FixedPointValue. + /// return sign, integer, fraction as binary strings. + @override + String toString() => + "(${signed ? '${value[-1].toString(includeWidth: false)} ' : ''}" + "${(m > 0) ? '${value.slice(m + n - 1, n).bitString} ' : ''}" + '${value.slice(n - 1, 0).toString(includeWidth: false)})'; + + /// Constructs [FixedPointValue] from a Dart [double] rounding away from zero. factory FixedPointValue.ofDouble(double val, {required bool signed, required int m, required int n}) { if (!signed & (val < 0)) { @@ -155,6 +163,18 @@ class FixedPointValue implements Comparable { return FixedPointValue(value: v, signed: signed, m: m, n: n); } + /// Constructs [FixedPointValue] from a Dart [double] without rounding. + factory FixedPointValue.ofDoubleUnrounded(double val, + {required bool signed, required int m, required int n}) { + if (!signed & (val < 0)) { + throw RohdHclException('Negative input not allowed with unsigned'); + } + final integerValue = (val * pow(2, n + 1)).toInt(); + final w = signed ? 1 + m + n : m + n; + final v = LogicValue.ofInt(integerValue >> 1, w); + return FixedPointValue(value: v, signed: signed, m: m, n: n); + } + /// Converts a fixed-point value to a Dart [double]. 
double toDouble() { if (m + n > 52) { diff --git a/test/arithmetic/fixed_to_float_test.dart b/test/arithmetic/fixed_to_float_test.dart index 8564a9174..81ab9a8f4 100644 --- a/test/arithmetic/fixed_to_float_test.dart +++ b/test/arithmetic/fixed_to_float_test.dart @@ -29,6 +29,34 @@ void main() async { reason: 'mantissa mismatch'); }); + test('FixedToFloat: exhaustive', () async { + final fixed = FixedPoint(signed: true, m: 8, n: 8); + final dut = FixedToFloat(fixed, exponentWidth: 8, mantissaWidth: 16); + await dut.build(); + for (var val = 0; val < pow(2, fixed.width); val++) { + final fixedValue = FixedPointValue( + value: LogicValue.ofInt(val, fixed.width), + signed: true, + m: fixed.m, + n: fixed.n); + fixed.put(fixedValue); + final fpv = dut.float.floatingPointValue; + final fpvExpected = FloatingPointValue.ofDouble(fixedValue.toDouble(), + exponentWidth: dut.exponentWidth, mantissaWidth: dut.mantissaWidth); + final newFixed = FixedPointValue.ofDouble(fpv.toDouble(), + signed: true, m: fixed.m, n: fixed.n); + expect(newFixed, equals(fixedValue), reason: ''' + fpvdbl=${fpv.toDouble()} $fpv + ${newFixed.toDouble()} $newFixed + ${fixedValue.toDouble()} $fixedValue + ${fixed.fixedPointValue.toDouble()} ${fixed.fixedPointValue} +'''); + expect(fpv.sign, fpvExpected.sign); + expect(fpv.exponent, fpvExpected.exponent, reason: 'exponent'); + expect(fpv.mantissa, fpvExpected.mantissa, reason: 'mantissa'); + } + }); + test('Q16.16 to E5M2 < pow(2,14)', () async { final fixed = FixedPoint(signed: true, m: 16, n: 16); final dut = FixedToFloat(fixed, exponentWidth: 5, mantissaWidth: 2); diff --git a/test/arithmetic/float_to_fixed_test.dart b/test/arithmetic/float_to_fixed_test.dart index aad8008ba..662c5ce98 100644 --- a/test/arithmetic/float_to_fixed_test.dart +++ b/test/arithmetic/float_to_fixed_test.dart @@ -30,6 +30,170 @@ void main() async { } }); + test('FloatToFixed: exhaustive lossless round-trip fp-fx-fp', () { + for (var sEW = 2; sEW < 6; sEW++) { + for (var sMW = 
2; sMW < 7; sMW++) { + final fp1 = FloatingPoint(exponentWidth: sEW, mantissaWidth: sMW) + ..put(0); + final convert = FloatToFixed(fp1); + for (final negate in [false, true]) { + for (var e1 = 0; e1 < pow(2, sEW) - 1; e1++) { + for (var m1 = 0; m1 < pow(2, sMW); m1++) { + final fv1 = FloatingPointValue.ofInts(e1, m1, + exponentWidth: sEW, mantissaWidth: sMW, sign: negate); + fp1.put(fv1.value); + final fx2 = convert.fixed; + final dbl = fx2.fixedPointValue.toDouble(); + final dbl2 = fv1.toDouble(); + expect(dbl, equals(dbl2)); + } + } + } + } + } + }); + + // TODO(desmonddak): float-to-fixed is limited by e=6 by toDouble() + test('FloatToFixed: exhaustive round-trip fp->smallerfx fpv->xpv', () { + for (var sEW = 2; sEW < 5; sEW++) { + print('sEW=$sEW'); + for (var sMW = 2; sMW < 5; sMW++) { + final fp1 = FloatingPoint(exponentWidth: sEW, mantissaWidth: sMW) + ..put(0); + final nominal = FloatToFixed(fp1); + for (var i = 0; i < nominal.n - 2; i++) { + final tN = nominal.n - i; + for (var j = 0; j < nominal.m - 2; j++) { + final tM = nominal.m - j; + final convert = FloatToFixed(fp1, m: tM, n: tN); + final fxc = convert.fixed; + for (final negate in [false, true]) { + for (var e1 = 0; e1 < pow(2, sEW) - 1; e1++) { + for (var m1 = 0; m1 < pow(2, sMW); m1++) { + final fv1 = FloatingPointValue.ofInts(e1, m1, + exponentWidth: sEW, mantissaWidth: sMW, sign: negate); + fp1.put(fv1.value); + final fx = FixedPointValue.ofDouble(fv1.toDouble(), + signed: true, m: tM, n: tN); + expect(fxc.fixedPointValue, equals(fx), reason: ''' + $fx (${fx.toDouble()}) + ${fxc.fixedPointValue} (${fxc.fixedPointValue.toDouble()}) + $fv1 (${fv1.toDouble()}) + sEW=$sEW + sMW=$sMW + e1=$e1 + m1=$m1 + m=$tM + n=$tN + negate=$negate +'''); + } + } + } + } + } + } + } + }); + // TODO(desmonddak): we use rounding to avoid problems with negative + // numbers, but we don't have any rounding code so this may end up + // with some problems in other corner cases. 
+ test('FloatToFixed: exhaustive round-trip fp->smaller_n fpv->xpv', () { + for (var sEW = 2; sEW < 5; sEW++) { + for (var sMW = 2; sMW < 6; sMW++) { + final fp1 = FloatingPoint(exponentWidth: sEW, mantissaWidth: sMW) + ..put(0); + final nominal = FloatToFixed(fp1); + for (var i = 0; i < nominal.n - 2; i++) { + final tN = nominal.n - i; + final tM = nominal.m; + final convert = FloatToFixed(fp1, m: tM, n: tN); + for (final negate in [false, true]) { + for (var e1 = 0; e1 < pow(2, sEW) - 1; e1++) { + for (var m1 = 0; m1 < pow(2, sMW); m1++) { + final fv1 = FloatingPointValue.ofInts(e1, m1, + exponentWidth: sEW, mantissaWidth: sMW, sign: negate); + fp1.put(fv1.value); + final fxc = convert.fixed; + + final fx = FixedPointValue.ofDouble(fv1.toDouble(), + signed: true, m: tM, n: tN); + + expect(fxc.fixedPointValue, equals(fx), reason: ''' + $fx (${fx.toDouble()}) + ${fxc.fixedPointValue} (${fxc.fixedPointValue.toDouble()}) + $fv1 (${fv1.toDouble()}) + sEW=$sEW + sMW=$sMW + e1=$e1 + m1=$m1 + m=$tM + n=$tN + negate=$negate +'''); + } + } + } + } + } + } + }); + + test('FloatToFixed: exhaustive round-trip fp->smaller_m->fp', () { + for (var sEW = 2; sEW < 5; sEW++) { + for (var sMW = 2; sMW < 5; sMW++) { + final fp1 = FloatingPoint(exponentWidth: sEW, mantissaWidth: sMW) + ..put(0); + final nominal = FloatToFixed(fp1); + for (var i = 0; i < nominal.m - 2; i++) { + final tM = nominal.m - i; + final convert = + FloatToFixed(fp1, m: tM, n: nominal.n, checkOverflow: true); + for (final negate in [false, true]) { + for (var e1 = 0; e1 < pow(2, sEW) - 1; e1++) { + for (var m1 = 0; m1 < pow(2, sMW); m1++) { + final fv1 = FloatingPointValue.ofInts(e1, m1, + exponentWidth: sEW, mantissaWidth: sMW, sign: negate); + fp1.put(fv1.value); + final fx2 = convert.fixed; + final dbl = fx2.fixedPointValue.toDouble(); + final dbl2 = fv1.toDouble(); + if (convert.overflow != null) { + if (!convert.overflow!.value.toBool()) { + expect(dbl, equals(dbl2)); + } + } + } + } + } + } + } + } + }); + 
test('FloatToFixed: exhaustive round-trip fp->larger_fx->fp', () { + for (var sEW = 2; sEW < 6; sEW++) { + for (var sMW = 2; sMW < 7; sMW++) { + final fp1 = FloatingPoint(exponentWidth: sEW, mantissaWidth: sMW) + ..put(0); + final nominal = FloatToFixed(fp1); + final convert = FloatToFixed(fp1, m: nominal.m + 4, n: nominal.n + 2); + for (final negate in [false, true]) { + for (var e1 = 0; e1 < pow(2, sEW) - 1; e1++) { + for (var m1 = 0; m1 < pow(2, sMW); m1++) { + final fv1 = FloatingPointValue.ofInts(e1, m1, + exponentWidth: sEW, mantissaWidth: sMW, sign: negate); + fp1.put(fv1.value); + final fx2 = convert.fixed; + final dbl = fx2.fixedPointValue.toDouble(); + final dbl2 = fv1.toDouble(); + expect(dbl, equals(dbl2)); + } + } + } + } + } + }); + test('FP8toINT: exhaustive', () async { final float = Logic(width: 8); final mode = Logic(); @@ -65,3 +229,16 @@ void main() async { } }); } + +// Idea for testing lossy conversion: float to fixed: +// Converting a fixed to a larger float, then back to the fixed should result +// in no loss. While there is loss, it should not be seen in representable +// numbers. + +// TODO(desmonddak): write this test first and drive the lossy conversion. +// Should we test this first on the Value side? + +// Alternatively: fixed to float: +// if we use a larger than needed float and convert back to fixed, there +// should be no loss. 
+// TODO(desmonddak): Value first, then logic diff --git a/test/arithmetic/values/fixed_point_value_test.dart b/test/arithmetic/values/fixed_point_value_test.dart index f19a3c25c..d8d1cdaa1 100644 --- a/test/arithmetic/values/fixed_point_value_test.dart +++ b/test/arithmetic/values/fixed_point_value_test.dart @@ -174,6 +174,32 @@ void main() { LogicValue.one); }); + test('FixedPointValue: exhaustive double round-trip', () { + const width = 8; + const m = 3; + const n = 4; + for (var i = 0; i < pow(2, width); i++) { + final fxv = FixedPointValue( + value: LogicValue.ofInt(i, width), signed: true, m: m, n: n); + final dbl = fxv.toDouble(); + final fxv2 = FixedPointValue.ofDouble(dbl, signed: true, m: m, n: n); + expect(fxv, equals(fxv2)); + } + }); + + test('FixedPoint: Math singleton', () { + final fxp1 = FixedPointValue.ofDouble(signed: true, 0.25, m: 2, n: 3); + final fxp2 = FixedPointValue.ofDouble(signed: true, 0.25, m: 2, n: 3); + final exp = FixedPointValue.ofDouble(0.0625, signed: true, m: 5, n: 6); + + final fxp = fxp1 * fxp2; + print(fxp1); + print(fxp2); + print('exp = $exp (${exp.toDouble()})'); + print('fxp = $fxp (${fxp.toDouble()})'); + print('1=${fxp1.toDouble()} 2=${fxp2.toDouble()} p=${fxp.toDouble()}'); + }); + test('Math', () { const w = 4; FixedPointValue fxp; From 9b85659108d2cc1baaf61b0104f36b2cee795242 Mon Sep 17 00:00:00 2001 From: "Desmond A. 
Kirkpatrick" Date: Thu, 6 Feb 2025 15:30:14 -0800 Subject: [PATCH 2/9] under test --- lib/src/arithmetic/float_to_fixed.dart | 3 + test/arithmetic/float_to_fixed_test.dart | 66 ++++++++++++++----- .../values/fixed_point_value_test.dart | 13 ---- 3 files changed, 54 insertions(+), 28 deletions(-) diff --git a/lib/src/arithmetic/float_to_fixed.dart b/lib/src/arithmetic/float_to_fixed.dart index 9967dd19e..5bf546a35 100644 --- a/lib/src/arithmetic/float_to_fixed.dart +++ b/lib/src/arithmetic/float_to_fixed.dart @@ -62,6 +62,7 @@ class FloatToFixed extends Module { final jBit = Logic(name: 'jBit')..gets(float.isNormal); final fullMantissa = [jBit, float.mantissa].swizzle().named('fullMantissa'); + print('fullMantissa: ${fullMantissa.value.bitString}'); final eWidth = max(log2Ceil(this.n + this.m), float.exponent.width) + 1; final shift = Logic(name: 'shift', width: eWidth); @@ -79,6 +80,8 @@ class FloatToFixed extends Module { Const(noLossN - this.n, width: eWidth); } + print('shift=${shift.value.toInt()}'); + if (checkOverflow & ((this.m < noLossM) | (this.n < noLossN))) { final overFlow = Logic(name: 'overflow'); final leadDetect = ParallelPrefixPriorityEncoder(fullMantissa.reversed); diff --git a/test/arithmetic/float_to_fixed_test.dart b/test/arithmetic/float_to_fixed_test.dart index 662c5ce98..6cdde8d8a 100644 --- a/test/arithmetic/float_to_fixed_test.dart +++ b/test/arithmetic/float_to_fixed_test.dart @@ -53,11 +53,42 @@ void main() async { } }); + test('FloatToFixed: singleton replication', () { + const sEW = 3; + const sMW = 11; + const e1 = 1; + const m1 = 646; + final fv1 = FloatingPointValue.ofInts(e1, m1, + exponentWidth: sEW, mantissaWidth: sMW); + final fp1 = FloatingPoint(exponentWidth: sEW, mantissaWidth: sMW) + ..put(fv1.value); + final nominal = FloatToFixed(fp1); + final tN = nominal.n - 9; + print('tN=$tN'); + const tM = 4; + final convert = FloatToFixed(fp1, m: tM, n: tN); + final fxc = convert.fixed; + + final fx = + 
FixedPointValue.ofDouble(fv1.toDouble(), signed: true, m: tM, n: tN); + + expect(fxc.fixedPointValue, equals(fx), reason: ''' + $fx (${fx.toDouble()}) + ${fxc.fixedPointValue} (${fxc.fixedPointValue.toDouble()}) + $fv1 (${fv1.toDouble()}) +'''); + }); + + // Failure: sEW=3 sMW=10 e1=0 m1=8 m=4 n=3 negate=false + // Oddly looks like a 1 was shifted into the sign position. + + // Failure: sEW=3 SMW=10 e1=0 m1=512, m=4, n=3, negate=false + // Could be a rounding issue as there is a 1 in the LSB only + // TODO(desmonddak): float-to-fixed is limited by e=6 by toDouble() test('FloatToFixed: exhaustive round-trip fp->smallerfx fpv->xpv', () { for (var sEW = 2; sEW < 5; sEW++) { - print('sEW=$sEW'); - for (var sMW = 2; sMW < 5; sMW++) { + for (var sMW = 2; sMW < 12; sMW++) { final fp1 = FloatingPoint(exponentWidth: sEW, mantissaWidth: sMW) ..put(0); final nominal = FloatToFixed(fp1); @@ -75,6 +106,24 @@ void main() async { fp1.put(fv1.value); final fx = FixedPointValue.ofDouble(fv1.toDouble(), signed: true, m: tM, n: tN); + if (fxc.fixedPointValue.value[-1] != fx.value[-1]) { + continue; + } + if (fxc.fixedPointValue != fx) { + print(''' + $fx (${fx.toDouble()}) + ${fxc.fixedPointValue} (${fxc.fixedPointValue.toDouble()}) + $fv1 (${fv1.toDouble()}) + sEW=$sEW + sMW=$sMW + e1=$e1 + m1=$m1 + m=$tM + n=$tN + negate=$negate +'''); + continue; + } expect(fxc.fixedPointValue, equals(fx), reason: ''' $fx (${fx.toDouble()}) ${fxc.fixedPointValue} (${fxc.fixedPointValue.toDouble()}) @@ -229,16 +278,3 @@ void main() async { } }); } - -// Idea for testing lossy conversion: float to fixed: -// Converting a fixed to a larger float, then back to the fixed should result -// in no loss. While there is loss, it should not be seen in representable -// numbers. - -// TODO(desmonddak): write this test first and drive the lossy conversion. -// Should we test this first on the Value side? 
- -// Alternatively: fixed to float: -// if we use a larger than needed float and convert back to fixed, there -// should be no loss. -// TODO(desmonddak): Value first, then logic diff --git a/test/arithmetic/values/fixed_point_value_test.dart b/test/arithmetic/values/fixed_point_value_test.dart index d8d1cdaa1..bb3dcac00 100644 --- a/test/arithmetic/values/fixed_point_value_test.dart +++ b/test/arithmetic/values/fixed_point_value_test.dart @@ -187,19 +187,6 @@ void main() { } }); - test('FixedPoint: Math singleton', () { - final fxp1 = FixedPointValue.ofDouble(signed: true, 0.25, m: 2, n: 3); - final fxp2 = FixedPointValue.ofDouble(signed: true, 0.25, m: 2, n: 3); - final exp = FixedPointValue.ofDouble(0.0625, signed: true, m: 5, n: 6); - - final fxp = fxp1 * fxp2; - print(fxp1); - print(fxp2); - print('exp = $exp (${exp.toDouble()})'); - print('fxp = $fxp (${fxp.toDouble()})'); - print('1=${fxp1.toDouble()} 2=${fxp2.toDouble()} p=${fxp.toDouble()}'); - }); - test('Math', () { const w = 4; FixedPointValue fxp; From 7216645d74ad831f6b4de9c4531a13f2c9cc2932 Mon Sep 17 00:00:00 2001 From: "Desmond A. 
Kirkpatrick" Date: Thu, 6 Feb 2025 21:00:53 -0800 Subject: [PATCH 3/9] working float to fixed lossy --- lib/src/arithmetic/float_to_fixed.dart | 5 +- .../arithmetic/values/fixed_point_value.dart | 17 ++++++ test/arithmetic/float_to_fixed_test.dart | 61 +++---------------- .../values/fixed_point_value_test.dart | 4 ++ 4 files changed, 31 insertions(+), 56 deletions(-) diff --git a/lib/src/arithmetic/float_to_fixed.dart b/lib/src/arithmetic/float_to_fixed.dart index 5bf546a35..d83af9516 100644 --- a/lib/src/arithmetic/float_to_fixed.dart +++ b/lib/src/arithmetic/float_to_fixed.dart @@ -62,9 +62,8 @@ class FloatToFixed extends Module { final jBit = Logic(name: 'jBit')..gets(float.isNormal); final fullMantissa = [jBit, float.mantissa].swizzle().named('fullMantissa'); - print('fullMantissa: ${fullMantissa.value.bitString}'); - final eWidth = max(log2Ceil(this.n + this.m), float.exponent.width) + 1; + final eWidth = max(log2Ceil(this.n + this.m), float.exponent.width) + 2; final shift = Logic(name: 'shift', width: eWidth); final exp = (float.exponent - 1).zeroExtend(eWidth); @@ -80,8 +79,6 @@ class FloatToFixed extends Module { Const(noLossN - this.n, width: eWidth); } - print('shift=${shift.value.toInt()}'); - if (checkOverflow & ((this.m < noLossM) | (this.n < noLossN))) { final overFlow = Logic(name: 'overflow'); final leadDetect = ParallelPrefixPriorityEncoder(fullMantissa.reversed); diff --git a/lib/src/arithmetic/values/fixed_point_value.dart b/lib/src/arithmetic/values/fixed_point_value.dart index 2b64e8dcd..a6787b8b0 100644 --- a/lib/src/arithmetic/values/fixed_point_value.dart +++ b/lib/src/arithmetic/values/fixed_point_value.dart @@ -151,12 +151,29 @@ class FixedPointValue implements Comparable { "${(m > 0) ? '${value.slice(m + n - 1, n).bitString} ' : ''}" '${value.slice(n - 1, 0).toString(includeWidth: false)})'; + /// Return true if double [val] can be stored in FixedPointValue with [m] and [n] /// lengths. 
+ static bool canStore(double val, + {required bool signed, required int m, required int n}) { + final w = signed ? 1 + m + n : m + n; + final bigIntegerValue = BigInt.from(val * pow(2, n)); + final negBigIntegerValue = BigInt.from(-val * pow(2, n)); + final l = (val < 0.0) + ? max(bigIntegerValue.bitLength, negBigIntegerValue.bitLength) + : bigIntegerValue.bitLength; + return l <= w; + } + /// Constructs [FixedPointValue] from a Dart [double] rounding away from zero. factory FixedPointValue.ofDouble(double val, {required bool signed, required int m, required int n}) { if (!signed & (val < 0)) { throw RohdHclException('Negative input not allowed with unsigned'); } + if (!canStore(val, signed: signed, m: m, n: n)) { + throw RohdHclException('Double is too long to store in ' + 'FixedPointValue: $m, $n'); + } final integerValue = (val * pow(2, n)).toInt(); final w = signed ? 1 + m + n : m + n; final v = LogicValue.ofInt(integerValue, w); diff --git a/test/arithmetic/float_to_fixed_test.dart b/test/arithmetic/float_to_fixed_test.dart index 6cdde8d8a..506ca968c 100644 --- a/test/arithmetic/float_to_fixed_test.dart +++ b/test/arithmetic/float_to_fixed_test.dart @@ -53,42 +53,10 @@ void main() async { } }); - test('FloatToFixed: singleton replication', () { - const sEW = 3; - const sMW = 11; - const e1 = 1; - const m1 = 646; - final fv1 = FloatingPointValue.ofInts(e1, m1, - exponentWidth: sEW, mantissaWidth: sMW); - final fp1 = FloatingPoint(exponentWidth: sEW, mantissaWidth: sMW) - ..put(fv1.value); - final nominal = FloatToFixed(fp1); - final tN = nominal.n - 9; - print('tN=$tN'); - const tM = 4; - final convert = FloatToFixed(fp1, m: tM, n: tN); - final fxc = convert.fixed; - - final fx = - FixedPointValue.ofDouble(fv1.toDouble(), signed: true, m: tM, n: tN); - - expect(fxc.fixedPointValue, equals(fx), reason: ''' - $fx (${fx.toDouble()}) - ${fxc.fixedPointValue} (${fxc.fixedPointValue.toDouble()}) - $fv1 (${fv1.toDouble()}) -'''); - }); - - // Failure: sEW=3 sMW=10 
e1=0 m1=8 m=4 n=3 negate=false - // Oddly looks like a 1 was shifted into the sign position. - - // Failure: sEW=3 SMW=10 e1=0 m1=512, m=4, n=3, negate=false - // Could be a rounding issue as there is a 1 in the LSB only - // TODO(desmonddak): float-to-fixed is limited by e=6 by toDouble() test('FloatToFixed: exhaustive round-trip fp->smallerfx fpv->xpv', () { for (var sEW = 2; sEW < 5; sEW++) { - for (var sMW = 2; sMW < 12; sMW++) { + for (var sMW = 2; sMW < 6; sMW++) { final fp1 = FloatingPoint(exponentWidth: sEW, mantissaWidth: sMW) ..put(0); final nominal = FloatToFixed(fp1); @@ -104,13 +72,13 @@ void main() async { final fv1 = FloatingPointValue.ofInts(e1, m1, exponentWidth: sEW, mantissaWidth: sMW, sign: negate); fp1.put(fv1.value); - final fx = FixedPointValue.ofDouble(fv1.toDouble(), - signed: true, m: tM, n: tN); - if (fxc.fixedPointValue.value[-1] != fx.value[-1]) { - continue; - } - if (fxc.fixedPointValue != fx) { - print(''' + final val = fv1.toDouble(); + if (FixedPointValue.canStore(val, + signed: true, m: tM, n: tN)) { + final fx = FixedPointValue.ofDouble(fv1.toDouble(), + signed: true, m: tM, n: tN); + + expect(fxc.fixedPointValue, equals(fx), reason: ''' $fx (${fx.toDouble()}) ${fxc.fixedPointValue} (${fxc.fixedPointValue.toDouble()}) $fv1 (${fv1.toDouble()}) @@ -122,20 +90,9 @@ void main() async { n=$tN negate=$negate '''); + } else { continue; } - expect(fxc.fixedPointValue, equals(fx), reason: ''' - $fx (${fx.toDouble()}) - ${fxc.fixedPointValue} (${fxc.fixedPointValue.toDouble()}) - $fv1 (${fv1.toDouble()}) - sEW=$sEW - sMW=$sMW - e1=$e1 - m1=$m1 - m=$tM - n=$tN - negate=$negate -'''); } } } diff --git a/test/arithmetic/values/fixed_point_value_test.dart b/test/arithmetic/values/fixed_point_value_test.dart index bb3dcac00..043391658 100644 --- a/test/arithmetic/values/fixed_point_value_test.dart +++ b/test/arithmetic/values/fixed_point_value_test.dart @@ -182,6 +182,10 @@ void main() { final fxv = FixedPointValue( value: LogicValue.ofInt(i, 
width), signed: true, m: m, n: n); final dbl = fxv.toDouble(); + if (!FixedPointValue.canStore(dbl, + signed: fxv.signed, m: fxv.m, n: fxv.n)) { + print('error'); + } final fxv2 = FixedPointValue.ofDouble(dbl, signed: true, m: m, n: n); expect(fxv, equals(fxv2)); } From 192cd52c3d9fc55adfe144aef61a0663bd4abf27 Mon Sep 17 00:00:00 2001 From: "Desmond A. Kirkpatrick" Date: Fri, 7 Feb 2025 08:55:43 -0800 Subject: [PATCH 4/9] working tests for float-to-fixed lossy, docs --- doc/components/fixed_point.md | 2 + lib/src/arithmetic/fixed_to_float.dart | 2 + lib/src/arithmetic/float_to_fixed.dart | 10 ++-- .../arithmetic/values/fixed_point_value.dart | 15 +++--- test/arithmetic/fixed_to_float_test.dart | 2 - test/arithmetic/float_to_fixed_test.dart | 50 +++++++++++++++++++ 6 files changed, 68 insertions(+), 13 deletions(-) diff --git a/doc/components/fixed_point.md b/doc/components/fixed_point.md index fb21f491c..26cc76147 100644 --- a/doc/components/fixed_point.md +++ b/doc/components/fixed_point.md @@ -18,6 +18,8 @@ This component converts a fixed-point signal to a floating point signal specifie This component converts a floating-point signal to a signed fixed-point signal. Infinities and NaN's are not supported. The integer and fraction widths are auto-calculated to achieve lossles conversion. +If the `m` and `n` integer and fraction widths are supplied, then lossy conversion is performed to fit the floating-point value into the fixed-point value. For testing, [FixedPointValue] has a `canStore` method to predetermine if a double can fit. For execution, [FloatToFixed] can perform overflow detection by setting a `checkOverflow` option. + ## Float8ToFixed This component converts an 8-bit floating-point (FP8) representation ([FloatingPoint8E4M3Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E4M3Value-class.html) or [FloatingPoint8E5M2Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E5M2Value-class.html)) to a signed fixed-point representation. 
This component offers using the same hardware for both FP8 formats. Therefore, both input and output are of type [Logic](https://intel.github.io/rohd/rohd/Logic-class.html) and can be cast from/to floating point/fixed point by the producer/consumer based on the selected `mode`. Infinities and NaN's are not supported. The output width is 33bits to accomodate [FloatingPoint8E5M2Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E5M2Value-class.html) without loss. diff --git a/lib/src/arithmetic/fixed_to_float.dart b/lib/src/arithmetic/fixed_to_float.dart index bd26006eb..41183ff05 100644 --- a/lib/src/arithmetic/fixed_to_float.dart +++ b/lib/src/arithmetic/fixed_to_float.dart @@ -65,6 +65,8 @@ class FixedToFloat extends Module { .zeroExtend(iWidth) .named('jBit'); + // TODO(desmonddak): refactor to use the roundRNE component + // Extract mantissa final mantissa = Logic(name: 'mantissa', width: mantissaWidth); final guard = Logic(name: 'guardBit'); diff --git a/lib/src/arithmetic/float_to_fixed.dart b/lib/src/arithmetic/float_to_fixed.dart index d83af9516..3d994bc6e 100644 --- a/lib/src/arithmetic/float_to_fixed.dart +++ b/lib/src/arithmetic/float_to_fixed.dart @@ -78,6 +78,10 @@ class FloatToFixed extends Module { mux(jBit, exp, Const(0, width: eWidth)) - Const(noLossN - this.n, width: eWidth); } + // TODO(desmonddak): Could use signed shifter if we unified shift math + final shiftRight = ((fullMantissa.width > outputWidth) + ? (~shift + 1) - (fullMantissa.width - outputWidth) + : (~shift + 1)); if (checkOverflow & ((this.m < noLossM) | (this.n < noLossN))) { final overFlow = Logic(name: 'overflow'); @@ -104,11 +108,7 @@ class FloatToFixed extends Module { final preNumber = (outputWidth >= fullMantissa.width) ? 
fullMantissa.zeroExtend(outputWidth) : fullMantissa.slice(-1, fullMantissa.width - outputWidth); - // TODO(desmonddak): Rounder is needed when shift is negative, - // LSB(fullMantissa) = shiftRight - final shiftRight = ((fullMantissa.width > outputWidth) - ? (~shift + 1) - (fullMantissa.width - outputWidth) - : (~shift + 1)); + // TODO(desmonddak): Rounder is needed when shifting right final number = mux(shift[-1], preNumber >>> shiftRight, preNumber << shift); diff --git a/lib/src/arithmetic/values/fixed_point_value.dart b/lib/src/arithmetic/values/fixed_point_value.dart index a6787b8b0..5ffd9747b 100644 --- a/lib/src/arithmetic/values/fixed_point_value.dart +++ b/lib/src/arithmetic/values/fixed_point_value.dart @@ -156,12 +156,15 @@ class FixedPointValue implements Comparable { static bool canStore(double val, {required bool signed, required int m, required int n}) { final w = signed ? 1 + m + n : m + n; - final bigIntegerValue = BigInt.from(val * pow(2, n)); - final negBigIntegerValue = BigInt.from(-val * pow(2, n)); - final l = (val < 0.0) - ? max(bigIntegerValue.bitLength, negBigIntegerValue.bitLength) - : bigIntegerValue.bitLength; - return l <= w; + if (val.isFinite) { + final bigIntegerValue = BigInt.from(val * pow(2, n)); + final negBigIntegerValue = BigInt.from(-val * pow(2, n)); + final l = (val < 0.0) + ? max(bigIntegerValue.bitLength, negBigIntegerValue.bitLength) + : bigIntegerValue.bitLength; + return l <= w; + } + return false; } /// Constructs [FixedPointValue] from a Dart [double] rounding away from zero. diff --git a/test/arithmetic/fixed_to_float_test.dart b/test/arithmetic/fixed_to_float_test.dart index 81ab9a8f4..bc5e98e78 100644 --- a/test/arithmetic/fixed_to_float_test.dart +++ b/test/arithmetic/fixed_to_float_test.dart @@ -167,8 +167,6 @@ void main() async { } }); - // TODO(desmonddak): complete this test as now - // FloatingPointValue.ofDouble handles infinities. 
test('Signed Q7.0 to E3M2', () async { final fixed = FixedPoint(signed: true, m: 7, n: 0); final dut = FixedToFloat(fixed, exponentWidth: 3, mantissaWidth: 2); diff --git a/test/arithmetic/float_to_fixed_test.dart b/test/arithmetic/float_to_fixed_test.dart index 506ca968c..0c6945853 100644 --- a/test/arithmetic/float_to_fixed_test.dart +++ b/test/arithmetic/float_to_fixed_test.dart @@ -234,4 +234,54 @@ void main() async { } } }); + test('FloatToFixed: BF16 singleton', () { + final bf16 = FloatingPointBF16(); + final bf16Val = + FloatingPointBF16Value.ofBinaryStrings('0', '00000000', '0000010'); + bf16.put(bf16Val); + const m = 18; + const n = 16; + final convert = FloatToFixed(bf16, m: m, n: n); + final expectedDbl = bf16Val.toDouble(); + + if (FixedPointValue.canStore(expectedDbl, + signed: true, m: convert.m, n: convert.n)) { + final expected = + FixedPointValue.ofDouble(expectedDbl, signed: true, m: m, n: n); + final fixedVal = convert.fixed; + final computedDbl = fixedVal.fixedPointValue.toDouble(); + final computed = + FixedPointValue.ofDouble(computedDbl, signed: true, m: m, n: n); + expect(expected, equals(computed), reason: ''' + expected=$expected ($expectedDbl) + computed=$computed ($computedDbl) +'''); + } + }); + test('FloatToFixed: BF16', () { + final bf16 = FloatingPointBF16()..put(0); + const m = 18; + const n = 16; + final convert = FloatToFixed(bf16, m: m, n: n); + for (var i = 0; i < pow(2, 16); i++) { + final val = LogicValue.ofInt(i, 16); + final bf16Val = FloatingPointBF16Value.ofLogicValue(val); + bf16.put(bf16Val); + final expectedDbl = bf16Val.toDouble(); + + if (FixedPointValue.canStore(expectedDbl, + signed: true, m: convert.m, n: convert.n)) { + final expected = + FixedPointValue.ofDouble(expectedDbl, signed: true, m: m, n: n); + final fixedVal = convert.fixed; + final computedDbl = fixedVal.fixedPointValue.toDouble(); + final computed = + FixedPointValue.ofDouble(computedDbl, signed: true, m: m, n: n); + expect(expected, 
equals(computed), reason: ''' + expected=$expected ($expectedDbl) + computed=$computed ($computedDbl) +'''); + } + } + }); } From 2b5971bb6b6616d0a69f687fb295fd6a245bba73 Mon Sep 17 00:00:00 2001 From: "Desmond A. Kirkpatrick" Date: Fri, 7 Feb 2025 08:57:40 -0800 Subject: [PATCH 5/9] cleanup of prints --- doc/components/fixed_point.md | 10 +++++----- test/arithmetic/values/fixed_point_value_test.dart | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/components/fixed_point.md b/doc/components/fixed_point.md index 26cc76147..36edac383 100644 --- a/doc/components/fixed_point.md +++ b/doc/components/fixed_point.md @@ -1,6 +1,6 @@ # Fixed-Point Arithmetic -Fixed-point binary representation of numbers is useful several applications including digital signal processing and embedded systems. As a first step towards enabling fixed-point components, we created a new value system [FixedPointValue](https://intel.github.io/rohd-hcl/rohd_hcl/FixedPointValue-class.html) similar to [LogicValue](https://intel.github.io/rohd/rohd/LogicValue-class.html). +Fixed-point binary representation of numbers is useful several applications including digital signal processing and embedded systems. As a first step towards enabling fixed-point components, we created a new value system [FixedPointValue](https://intel.github.io/rohd-hcl/rohd_hcl/FixedPointValue-class.html) similar to [LogicValue](https://intel.github.io/rohd/rohd/LogicValue-class.html). ## FixedPointValue @@ -12,14 +12,14 @@ The [FixedPoint](https://intel.github.io/rohd-hcl/rohd_hcl/FixedPoint-class.html ## FixedToFloat -This component converts a fixed-point signal to a floating point signal specified by exponent and mantissa width. The output is rounded to nearest even when applicable and set to infinity if the input exceed the representable range. +This component converts a fixed-point signal to a floating point signal specified by exponent and mantissa width. 
The output is rounded to the nearest even (RNE) when applicable and set to infinity if the input exceed the representable range. ## FloatToFixed -This component converts a floating-point signal to a signed fixed-point signal. Infinities and NaN's are not supported. The integer and fraction widths are auto-calculated to achieve lossles conversion. +This component converts a floating-point signal to a signed fixed-point signal. Infinities and NaN's are not supported. The integer and fraction widths are auto-calculated to achieve lossless conversion. -If the `m` and `n` integer and fraction widths are supplied, then lossy conversion is performed to fit the floating-point value into the fixed-point value. For testing, [FixedPointValue] has a `canStore` method to predetermine if a double can fit. For execution, [FloatToFixed] can perform overflow detection by setting a `checkOverflow` option. +If the `m` and `n` integer and fraction widths are supplied, then lossy conversion is performed to fit the floating-point value into the fixed-point value. For testing, [FixedPointValue] has a `canStore` method to predetermine if a given double can fit. For execution, [FloatToFixed] can perform overflow detection by setting a `checkOverflow` option. ## Float8ToFixed -This component converts an 8-bit floating-point (FP8) representation ([FloatingPoint8E4M3Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E4M3Value-class.html) or [FloatingPoint8E5M2Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E5M2Value-class.html)) to a signed fixed-point representation. This component offers using the same hardware for both FP8 formats. Therefore, both input and output are of type [Logic](https://intel.github.io/rohd/rohd/Logic-class.html) and can be cast from/to floating point/fixed point by the producer/consumer based on the selected `mode`. Infinities and NaN's are not supported. 
The output width is 33bits to accomodate [FloatingPoint8E5M2Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E5M2Value-class.html) without loss. +This component converts an 8-bit floating-point (FP8) representation ([FloatingPoint8E4M3Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E4M3Value-class.html) or [FloatingPoint8E5M2Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E5M2Value-class.html)) to a signed fixed-point representation. This component offers using the same hardware for both FP8 formats. Therefore, both input and output are of type [Logic](https://intel.github.io/rohd/rohd/Logic-class.html) and can be cast from/to floating point/fixed point by the producer/consumer based on the selected `mode`. Infinities and NaN's are not supported. The output width is 33bits to accommodate [FloatingPoint8E5M2Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E5M2Value-class.html) without loss. diff --git a/test/arithmetic/values/fixed_point_value_test.dart b/test/arithmetic/values/fixed_point_value_test.dart index 043391658..ec5444646 100644 --- a/test/arithmetic/values/fixed_point_value_test.dart +++ b/test/arithmetic/values/fixed_point_value_test.dart @@ -184,7 +184,7 @@ void main() { final dbl = fxv.toDouble(); if (!FixedPointValue.canStore(dbl, signed: fxv.signed, m: fxv.m, n: fxv.n)) { - print('error'); + throw RohdHclException('generated a value that we cannot store'); } final fxv2 = FixedPointValue.ofDouble(dbl, signed: true, m: m, n: n); expect(fxv, equals(fxv2)); From 3962f99921852c38929fc70e854b372053adf367 Mon Sep 17 00:00:00 2001 From: "Desmond A. 
Kirkpatrick" Date: Fri, 7 Feb 2025 10:02:54 -0800 Subject: [PATCH 6/9] better internal signal naming --- lib/src/arithmetic/float_to_fixed.dart | 33 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/lib/src/arithmetic/float_to_fixed.dart b/lib/src/arithmetic/float_to_fixed.dart index 3d994bc6e..250554837 100644 --- a/lib/src/arithmetic/float_to_fixed.dart +++ b/lib/src/arithmetic/float_to_fixed.dart @@ -65,31 +65,36 @@ class FloatToFixed extends Module { final eWidth = max(log2Ceil(this.n + this.m), float.exponent.width) + 2; final shift = Logic(name: 'shift', width: eWidth); - final exp = (float.exponent - 1).zeroExtend(eWidth); + final exp = (float.exponent - 1).zeroExtend(eWidth).named('expMinus1'); if (this.n > noLossN) { shift <= mux(jBit, exp, Const(0, width: eWidth)) + - Const(this.n - noLossN, width: eWidth); + Const(this.n - noLossN, width: eWidth).named('deltaN'); } else if (this.n == noLossN) { shift <= mux(jBit, exp, Const(0, width: eWidth)); } else { shift <= mux(jBit, exp, Const(0, width: eWidth)) - - Const(noLossN - this.n, width: eWidth); + Const(noLossN - this.n, width: eWidth).named('deltaN'); } // TODO(desmonddak): Could use signed shifter if we unified shift math final shiftRight = ((fullMantissa.width > outputWidth) - ? (~shift + 1) - (fullMantissa.width - outputWidth) - : (~shift + 1)); + ? 
(~shift + 1) - (fullMantissa.width - outputWidth) + : (~shift + 1)) + .named('shiftRight'); if (checkOverflow & ((this.m < noLossM) | (this.n < noLossN))) { final overFlow = Logic(name: 'overflow'); - final leadDetect = ParallelPrefixPriorityEncoder(fullMantissa.reversed); + final leadDetect = ParallelPrefixPriorityEncoder(fullMantissa.reversed, + name: 'leadone_detector'); final sWidth = max(eWidth, leadDetect.out.width); - final fShift = shift.zeroExtend(sWidth); - final leadOne = leadDetect.out.zeroExtend(sWidth); + final fShift = shift.zeroExtend(sWidth).named('wideShift'); + final leadOne = leadDetect.out + .named('leadOneRaw') + .zeroExtend(sWidth) + .named('leadOne'); Combinational([ If(jBit, then: [ @@ -105,14 +110,16 @@ class FloatToFixed extends Module { ]); addOutput('overflow') <= overFlow; } - final preNumber = (outputWidth >= fullMantissa.width) - ? fullMantissa.zeroExtend(outputWidth) - : fullMantissa.slice(-1, fullMantissa.width - outputWidth); + final preNumber = ((outputWidth >= fullMantissa.width) + ? fullMantissa.zeroExtend(outputWidth) + : fullMantissa.slice(-1, fullMantissa.width - outputWidth)) + .named('newMantissaPreShift'); // TODO(desmonddak): Rounder is needed when shifting right - final number = mux(shift[-1], preNumber >>> shiftRight, preNumber << shift); + final number = mux(shift[-1], preNumber >>> shiftRight, preNumber << shift) + .named('number'); - _fixed <= mux(float.sign, ~number + 1, number); + _fixed <= mux(float.sign, (~number + 1).named('negNumber'), number); addOutput('fixed', width: outputWidth) <= _fixed; } } From 45f18ae8b5fd1738bac9ecf9b205aa4bdf178fe0 Mon Sep 17 00:00:00 2001 From: "Desmond A. 
Kirkpatrick" Date: Fri, 7 Feb 2025 12:41:05 -0800 Subject: [PATCH 7/9] doc update, checkOverflow as property --- doc/components/fixed_point.md | 4 ++-- lib/src/arithmetic/float_to_fixed.dart | 5 ++++- lib/src/arithmetic/values/fixed_point_value.dart | 9 ++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/components/fixed_point.md b/doc/components/fixed_point.md index 36edac383..6e2e6e973 100644 --- a/doc/components/fixed_point.md +++ b/doc/components/fixed_point.md @@ -12,13 +12,13 @@ The [FixedPoint](https://intel.github.io/rohd-hcl/rohd_hcl/FixedPoint-class.html ## FixedToFloat -This component converts a fixed-point signal to a floating point signal specified by exponent and mantissa width. The output is rounded to the nearest even (RNE) when applicable and set to infinity if the input exceed the representable range. +The [FixedToFloat](https://intel.github.io/rohd-hcl/rohd_hcl/FixedToFloat-class.html) component converts a fixed-point signal to a floating point signal specified by exponent and mantissa width. The output is rounded to the nearest even (RNE) when applicable and set to infinity if the input exceed the representable range. ## FloatToFixed This component converts a floating-point signal to a signed fixed-point signal. Infinities and NaN's are not supported. The integer and fraction widths are auto-calculated to achieve lossless conversion. -If the `m` and `n` integer and fraction widths are supplied, then lossy conversion is performed to fit the floating-point value into the fixed-point value. For testing, [FixedPointValue] has a `canStore` method to predetermine if a given double can fit. For execution, [FloatToFixed] can perform overflow detection by setting a `checkOverflow` option. +If the `m` and `n` integer and fraction widths are supplied, then lossy conversion is performed to fit the floating-point value into the fixed-point value. 
For testing, [FixedPointValue](https://intel.github.io/rohd-hcl/rohd_hcl/FixedPointValue-class.html) has a `canStore` method to predetermine if a given double can fit. For execution, [FloatToFixed](https://intel.github.io/rohd-hcl/rohd_hcl/FloatToFixed-class.html) can perform overflow detection by setting a `checkOverflow` option, which is a property of the class and set in the constructor (default is false as it must add significant logic to do the check). ## Float8ToFixed diff --git a/lib/src/arithmetic/float_to_fixed.dart b/lib/src/arithmetic/float_to_fixed.dart index 250554837..65079f931 100644 --- a/lib/src/arithmetic/float_to_fixed.dart +++ b/lib/src/arithmetic/float_to_fixed.dart @@ -26,6 +26,9 @@ class FloatToFixed extends Module { /// Width of output fractional part. late final int n; + /// Add overflow checking logic + final bool checkOverflow; + /// Return true if the conversion overflowed. Logic? get overflow => tryOutput('overflow'); @@ -43,7 +46,7 @@ class FloatToFixed extends Module { /// case that loss can occur and an optional output [overflow] will be /// produced that returns true when overflow occurs. FloatToFixed(FloatingPoint float, - {super.name = 'FloatToFixed', int? m, int? n, bool checkOverflow = false}) + {super.name = 'FloatToFixed', int? m, int? n, this.checkOverflow = false}) : super( definitionName: 'FloatE${float.exponent.width}' 'M${float.mantissa.width}ToFixed') { diff --git a/lib/src/arithmetic/values/fixed_point_value.dart b/lib/src/arithmetic/values/fixed_point_value.dart index 5ffd9747b..35f4b01ea 100644 --- a/lib/src/arithmetic/values/fixed_point_value.dart +++ b/lib/src/arithmetic/values/fixed_point_value.dart @@ -146,13 +146,12 @@ class FixedPointValue implements Comparable { /// Return a string representation of FloatingPointValue. /// return sign, exponent, mantissa as binary strings. @override - String toString() => - "(${signed ? '${value[-1].toString(includeWidth: false)} ' : ''}" + String toString() => "(${signed ? 
'${value[-1].bitString} ' : ''}" "${(m > 0) ? '${value.slice(m + n - 1, n).bitString} ' : ''}" - '${value.slice(n - 1, 0).toString(includeWidth: false)})'; + '${value.slice(n - 1, 0).bitString})'; - /// Return true if double [val] be stored in FixedPointValue with [m] and [n] - /// lengths. + /// Return true if double [val] to be stored in [FixedPointValue] + /// with [m] and [n] lengths without overflowing. static bool canStore(double val, {required bool signed, required int m, required int n}) { final w = signed ? 1 + m + n : m + n; From 7c5fac3981684dfb14a3db93525c05e73a745d9c Mon Sep 17 00:00:00 2001 From: "Desmond A. Kirkpatrick" Date: Fri, 7 Feb 2025 12:44:21 -0800 Subject: [PATCH 8/9] typo overflow --- lib/src/arithmetic/float_to_fixed.dart | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/src/arithmetic/float_to_fixed.dart b/lib/src/arithmetic/float_to_fixed.dart index 65079f931..e55d7d841 100644 --- a/lib/src/arithmetic/float_to_fixed.dart +++ b/lib/src/arithmetic/float_to_fixed.dart @@ -88,7 +88,7 @@ class FloatToFixed extends Module { .named('shiftRight'); if (checkOverflow & ((this.m < noLossM) | (this.n < noLossN))) { - final overFlow = Logic(name: 'overflow'); + final overflow = Logic(name: 'overflow'); final leadDetect = ParallelPrefixPriorityEncoder(fullMantissa.reversed, name: 'leadone_detector'); @@ -101,17 +101,17 @@ class FloatToFixed extends Module { Combinational([ If(jBit, then: [ - overFlow < shift.gte(outputWidth - float.mantissa.width - 1), + overflow < shift.gte(outputWidth - float.mantissa.width - 1), ], orElse: [ If(fShift.gt(leadOne), then: [ - overFlow < + overflow < (fShift - leadOne).gte(outputWidth - float.mantissa.width - 1), ], orElse: [ - overFlow < Const(0), + overflow < Const(0), ]), ]), ]); - addOutput('overflow') <= overFlow; + addOutput('overflow') <= overflow; } final preNumber = ((outputWidth >= fullMantissa.width) ? 
fullMantissa.zeroExtend(outputWidth) From 111bed0d78e803b9975e1f5e42ab42e467f53a59 Mon Sep 17 00:00:00 2001 From: "Desmond A. Kirkpatrick" Date: Mon, 10 Feb 2025 13:09:55 -0800 Subject: [PATCH 9/9] doc update for converter --- doc/components/fixed_point.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/components/fixed_point.md b/doc/components/fixed_point.md index 6e2e6e973..4709becf4 100644 --- a/doc/components/fixed_point.md +++ b/doc/components/fixed_point.md @@ -20,6 +20,8 @@ This component converts a floating-point signal to a signed fixed-point signal. If the `m` and `n` integer and fraction widths are supplied, then lossy conversion is performed to fit the floating-point value into the fixed-point value. For testing, [FixedPointValue](https://intel.github.io/rohd-hcl/rohd_hcl/FixedPointValue-class.html) has a `canStore` method to predetermine if a given double can fit. For execution, [FloatToFixed](https://intel.github.io/rohd-hcl/rohd_hcl/FloatToFixed-class.html) can perform overflow detection by setting a `checkOverflow` option, which is a property of the class and set in the constructor (default is false as it must add significant logic to do the check). +Currently, the FloatToFixed converter, when in lossy mode, is not performing any real rounding (just truncating). + ## Float8ToFixed This component converts an 8-bit floating-point (FP8) representation ([FloatingPoint8E4M3Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E4M3Value-class.html) or [FloatingPoint8E5M2Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E5M2Value-class.html)) to a signed fixed-point representation. This component offers using the same hardware for both FP8 formats. Therefore, both input and output are of type [Logic](https://intel.github.io/rohd/rohd/Logic-class.html) and can be cast from/to floating point/fixed point by the producer/consumer based on the selected `mode`. Infinities and NaN's are not supported. 
The output width is 33bits to accommodate [FloatingPoint8E5M2Value](https://intel.github.io/rohd-hcl/rohd_hcl/FloatingPoint8E5M2Value-class.html) without loss.