From 7d2adcc22e87ca078e449c70ca629edfdbdce21f Mon Sep 17 00:00:00 2001
From: Younies Mahmoud
Date: Mon, 20 Jan 2025 20:45:29 +0100
Subject: [PATCH] first implementation

---
 icu4c/source/i18n/measunit.cpp             |   1 +
 icu4c/source/i18n/measunit_extra.cpp       | 471 +++++++++++++++++----
 icu4c/source/i18n/measunit_impl.h          |   8 +
 icu4c/source/i18n/number_fluent.cpp        |   3 -
 icu4c/source/test/intltest/measfmttest.cpp |   2 +-
 icu4c/source/test/intltest/units_test.cpp  |  33 ++
 6 files changed, 431 insertions(+), 87 deletions(-)

diff --git a/icu4c/source/i18n/measunit.cpp b/icu4c/source/i18n/measunit.cpp
index 2741b84aabf0..75fb64711bb2 100644
--- a/icu4c/source/i18n/measunit.cpp
+++ b/icu4c/source/i18n/measunit.cpp
@@ -2400,6 +2400,7 @@ MeasureUnitImpl MeasureUnitImpl::copy(UErrorCode &status) const {
     MeasureUnitImpl result;
     result.complexity = complexity;
     result.identifier.append(identifier, status);
+    result.constantDenominator = constantDenominator;
     for (int32_t i = 0; i < singleUnits.length(); i++) {
         SingleUnitImpl *item = result.singleUnits.emplaceBack(*singleUnits[i]);
         if (!item) {
diff --git a/icu4c/source/i18n/measunit_extra.cpp b/icu4c/source/i18n/measunit_extra.cpp
index a6348422738b..f1d01af4ecc8 100644
--- a/icu4c/source/i18n/measunit_extra.cpp
+++ b/icu4c/source/i18n/measunit_extra.cpp
@@ -467,37 +467,63 @@ void U_CALLCONV initUnitExtras(UErrorCode& status) {
 
 class Token {
 public:
-    Token(int32_t match) : fMatch(match) {}
-
-    enum Type {
-        TYPE_UNDEFINED,
-        TYPE_PREFIX,
-        // Token type for "-per-", "-", and "-and-".
-        TYPE_COMPOUND_PART,
-        // Token type for "per-".
-        TYPE_INITIAL_COMPOUND_PART,
-        TYPE_POWER_PART,
-        TYPE_SIMPLE_UNIT,
-    };
-
-    // Calling getType() is invalid, resulting in an assertion failure, if Token
-    // value isn't positive.
-    Type getType() const {
-        U_ASSERT(fMatch > 0);
-        if (fMatch < kCompoundPartOffset) {
-            return TYPE_PREFIX;
-        }
-        if (fMatch < kInitialCompoundPartOffset) {
-            return TYPE_COMPOUND_PART;
-        }
-        if (fMatch < kPowerPartOffset) {
-            return TYPE_INITIAL_COMPOUND_PART;
-        }
-        if (fMatch < kSimpleUnitOffset) {
-            return TYPE_POWER_PART;
-        }
-        return TYPE_SIMPLE_UNIT;
-    }
+    // Builds a token from a trie match value and classifies it by the offset
+    // range the value falls in. A non-positive value (no match, see
+    // Parser::nextToken()) leaves the token as TYPE_UNDEFINED.
+    Token(int64_t match) : fMatch(match) {
+        if (fMatch <= 0) {
+            this->fType = TYPE_UNDEFINED;
+        } else if (fMatch < kCompoundPartOffset) {
+            this->fType = TYPE_PREFIX;
+        } else if (fMatch < kInitialCompoundPartOffset) {
+            this->fType = TYPE_COMPOUND_PART;
+        } else if (fMatch < kPowerPartOffset) {
+            this->fType = TYPE_INITIAL_COMPOUND_PART;
+        } else if (fMatch < kSimpleUnitOffset) {
+            this->fType = TYPE_POWER_PART;
+        } else {
+            this->fType = TYPE_SIMPLE_UNIT;
+        }
+    }
+
+    // Builds a TYPE_CONSTANT_DENOMINATOR token from the decimal text `str`
+    // (e.g. "100" or "1e3"). On failure, `status` is set and the returned
+    // token is TYPE_UNDEFINED.
+    static Token constantToken(StringPiece str, UErrorCode &status) {
+        Token result;
+        auto value = result.parseStrigToLong(str, status);
+        if (U_FAILURE(status)) {
+            return result;
+        }
+        result.fMatch = static_cast<int64_t>(value);
+        result.fType = TYPE_CONSTANT_DENOMINATOR;
+        return result;
+    }
+
+    enum Type {
+        TYPE_UNDEFINED,
+        TYPE_PREFIX,
+        // Token type for "-per-", "-", and "-and-".
+        TYPE_COMPOUND_PART,
+        // Token type for "per-".
+        TYPE_INITIAL_COMPOUND_PART,
+        TYPE_POWER_PART,
+        TYPE_SIMPLE_UNIT,
+        TYPE_CONSTANT_DENOMINATOR,
+    };
+
+    // Calling getType() is invalid, resulting in an assertion failure, if Token
+    // value isn't positive.
+    Type getType() const {
+        U_ASSERT(fMatch > 0);
+        return this->fType;
+    }
+
+    // Retrieve the value of the constant denominator if the token is of type TYPE_CONSTANT_DENOMINATOR.
+    uint64_t getConstantDenominator() const {
+        U_ASSERT(getType() == TYPE_CONSTANT_DENOMINATOR);
+        return static_cast<uint64_t>(fMatch);
+    }
 
     UMeasurePrefix getUnitPrefix() const {
         U_ASSERT(getType() == TYPE_PREFIX);
@@ -530,8 +556,125 @@ class Token {
         return fMatch - kSimpleUnitOffset;
     }
 
+    // TODO: Consider split function as a utility function.
+    // Parses `str` as a positive decimal integer with an optional non-negative
+    // decimal exponent, e.g. "100", "+100", "1e3" or "1E+3".
+    // The value must be in the range [1, int64_t max]; on any syntax or range
+    // error, `status` is set to `kUnitIdentifierSyntaxError`.
+    uint64_t parseStrigToLong(const StringPiece str, UErrorCode &status) {
+        // Largest accepted value, so that results also fit in an int64_t.
+        const uint64_t maxValue = static_cast<uint64_t>(std::numeric_limits<int64_t>::max());
+        uint64_t result = 0;
+
+        // Check for empty string
+        if (str.empty()) {
+            status = kUnitIdentifierSyntaxError;
+            return result;
+        }
+
+        int32_t exponent = 0;
+        int32_t exponentIndex = str.length();
+        bool hasExponent = false;
+
+        // Parse the mantissa.
+        for (int32_t i = 0; i < str.length(); ++i) {
+            char c = str.data()[i];
+
+            // Handle sign
+            if (i == 0 && c == '+') {
+                continue; // Skip leading plus sign
+            }
+            if (i == 0 && c == '-') {
+                status = kUnitIdentifierSyntaxError;
+                return result;
+            }
+
+            // Handle digits
+            if (c >= '0' && c <= '9') {
+                uint64_t digit = static_cast<uint64_t>(c - '0');
+                // Reject the digit up front if appending it would exceed maxValue.
+                if (result > (maxValue - digit) / 10) {
+                    status = kUnitIdentifierSyntaxError;
+                    return result;
+                }
+                result = result * 10 + digit;
+                continue;
+            }
+
+            // Handle 'e' or 'E'
+            if (c == 'e' || c == 'E') {
+                hasExponent = true;
+                exponentIndex = i + 1;
+                break;
+            }
+
+            // Invalid character
+            status = kUnitIdentifierSyntaxError;
+            return result;
+        }
+
+        // Parse the exponent, if any.
+        bool sawExponentDigit = false;
+        for (int32_t i = exponentIndex; i < str.length(); ++i) {
+            char c = str.data()[i];
+
+            // Handle sign
+            if (i == exponentIndex && c == '+') {
+                continue; // Skip leading plus sign
+            }
+            if (i == exponentIndex && c == '-') {
+                // Negative sign is not allowed for the exponent.
+                status = kUnitIdentifierSyntaxError;
+                return result;
+            }
+
+            // Handle digits
+            if (c >= '0' && c <= '9') {
+                exponent = exponent * 10 + (c - '0');
+                sawExponentDigit = true;
+
+                // An int64_t holds at most 19 decimal digits, so an exponent above 18
+                // always overflows; checking per digit also bounds `exponent` itself.
+                if (exponent > 18) {
+                    status = kUnitIdentifierSyntaxError;
+                    return result;
+                }
+                continue;
+            }
+
+            // Invalid character
+            status = kUnitIdentifierSyntaxError;
+            return result;
+        }
+
+        // An exponent marker must be followed by at least one digit.
+        if (hasExponent && !sawExponentDigit) {
+            status = kUnitIdentifierSyntaxError;
+            return result;
+        }
+
+        // Apply the exponent, checking for overflow before each multiplication.
+        for (int32_t i = 0; i < exponent; ++i) {
+            if (result > maxValue / 10) {
+                status = kUnitIdentifierSyntaxError;
+                return result;
+            }
+            result *= 10;
+        }
+
+        // Zero means "no constant denominator", so it is not a valid constant.
+        if (result == 0) {
+            status = kUnitIdentifierSyntaxError;
+            return result;
+        }
+
+        return result;
+    }
+
 private:
-    int32_t fMatch;
+    Token() = default;
+    int64_t fMatch = 0;
+    Type fType = TYPE_UNDEFINED;
 };
 
 class Parser {
@@ -555,6 +698,52 @@ class Parser {
         return {source};
     }
 
+    /**
+     * A single unit or a constant denominator: the result of parsing one
+     * element of a unit identifier. Only the payload selected by `type` is
+     * meaningful.
+     */
+    struct SingleUnitOrConstant {
+        enum ValueType {
+            kSingleUnit,
+            kConstantDenominator,
+        };
+
+        ValueType type = kSingleUnit;
+        SingleUnitImpl singleUnit;
+        uint64_t constantDenominator = 0;
+
+        static SingleUnitOrConstant singleUnitValue(SingleUnitImpl singleUnit) {
+            SingleUnitOrConstant result;
+            result.type = kSingleUnit;
+            result.singleUnit = singleUnit;
+            result.constantDenominator = 0;
+            return result;
+        }
+
+        static SingleUnitOrConstant constantDenominatorValue(uint64_t constant) {
+            SingleUnitOrConstant result;
+            result.type = kConstantDenominator;
+            result.singleUnit = {};
+            result.constantDenominator = constant;
+            return result;
+        }
+
+        uint64_t getConstantDenominator() const {
+            U_ASSERT(type == kConstantDenominator);
+            return constantDenominator;
+        }
+
+        SingleUnitImpl getSingleUnit() const {
+            U_ASSERT(type == kSingleUnit);
+            return singleUnit;
+        }
+
+        bool isSingleUnit() const { return type == kSingleUnit; }
+
+        bool isConstantDenominator() const { return type == kConstantDenominator; }
+    };
+
     MeasureUnitImpl parse(UErrorCode& status) {
         MeasureUnitImpl result;
 
@@ -569,12 +758,22 @@ class Parser {
         while (hasNext()) {
             bool sawAnd = false;
 
-            SingleUnitImpl singleUnit = nextSingleUnit(sawAnd, status);
+            auto singleUnitOrConstant = nextSingleUnitOrConstant(sawAnd, status);
             if (U_FAILURE(status)) {
                 return result;
             }
 
-            bool added = result.appendSingleUnit(singleUnit, status);
+            if (singleUnitOrConstant.isConstantDenominator()) {
+                result.constantDenominator = singleUnitOrConstant.getConstantDenominator();
+                // A constant denominator makes the unit compound, as in withConstantDenominator().
+                result.complexity = UMEASURE_UNIT_COMPOUND;
+                continue;
+            }
+
+            U_ASSERT(singleUnitOrConstant.isSingleUnit());
+            SingleUnitImpl singleUnit = singleUnitOrConstant.getSingleUnit();
+
+            bool added = result.appendSingleUnit(singleUnit, status);
             if (U_FAILURE(status)) {
                 return result;
             }
@@ -604,6 +803,12 @@ class Parser {
             }
         }
 
+        if (result.singleUnits.length() == 0) {
+            // The identifier was empty or only had a constant denominator.
+            status = kUnitIdentifierSyntaxError;
+            return result; // add it for code consistency.
+        }
+
         return result;
     }
 
@@ -622,6 +827,10 @@ class Parser {
     // identifier is invalid pending TODO(CLDR-13701).
     bool fAfterPer = false;
 
+    // Set to true when we've just seen a "per-". This is used to determine if
+    // the next token can be a constant denominator token.
+    bool fJustSawPer = false;
+
     Parser() : fSource(""), fTrie(u"") {}
 
     Parser(StringPiece source)
@@ -640,6 +849,10 @@ class Parser {
         // Saves the position in the fSource string for the end of the most
         // recent matching token.
         int32_t previ = -1;
+
+        // Saves the position in the fSource string for later use in case of unit constant found.
+        int32_t currentFIndex = fIndex;
+
         // Find the longest token that matches a value in the trie:
         while (fIndex < fSource.length()) {
             auto result = fTrie.next(fSource.data()[fIndex++]);
@@ -658,12 +871,33 @@ class Parser {
             // continue;
         }
 
-        if (match < 0) {
-            status = kUnitIdentifierSyntaxError;
-        } else {
+        if (match >= 0) {
             fIndex = previ;
+            return {match};
+        }
+
+        // the index of the character after the last character of the constant denominator.
+        int32_t endOfConstantIndex = -1;
+        // If no match was found, we check if the token is a constant denominator.
+        // 1. find the first `-` from the `currentFIndex` to the end.
+        for (int32_t i = currentFIndex; i < fSource.length(); ++i) {
+            if (fSource.data()[i] == '-') {
+                endOfConstantIndex = i;
+                break;
+            }
+        }
+        if (endOfConstantIndex == -1) {
+            endOfConstantIndex = fSource.length();
+        }
+        if (endOfConstantIndex <= currentFIndex) {
+            status = kUnitIdentifierSyntaxError;
+            return {match};
         }
-        return {match};
+
+        StringPiece constantDenominatorStr =
+            fSource.substr(currentFIndex, endOfConstantIndex - currentFIndex);
+        fIndex = endOfConstantIndex;
+        return Token::constantToken(constantDenominatorStr, status);
     }
 
     /**
@@ -680,10 +914,10 @@ class Parser {
      * unit", sawAnd is set to true. If not, it is left as is.
      * @param status ICU error code.
      */
-    SingleUnitImpl nextSingleUnit(bool &sawAnd, UErrorCode &status) {
-        SingleUnitImpl result;
+    SingleUnitOrConstant nextSingleUnitOrConstant(bool &sawAnd, UErrorCode &status) {
+        SingleUnitImpl singleUnitResult;
         if (U_FAILURE(status)) {
-            return result;
+            return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
         }
 
         // state:
@@ -695,19 +929,22 @@ class Parser {
         bool atStart = fIndex == 0;
         Token token = nextToken(status);
         if (U_FAILURE(status)) {
-            return result;
+            return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
         }
 
+        fJustSawPer = false;
+
         if (atStart) {
             // Identifiers optionally start with "per-".
             if (token.getType() == Token::TYPE_INITIAL_COMPOUND_PART) {
                 U_ASSERT(token.getInitialCompoundPart() == INITIAL_COMPOUND_PART_PER);
                 fAfterPer = true;
-                result.dimensionality = -1;
+                fJustSawPer = true;
+                singleUnitResult.dimensionality = -1;
 
                 token = nextToken(status);
                 if (U_FAILURE(status)) {
-                    return result;
+                    return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
                 }
             }
         } else {
@@ -715,7 +952,7 @@ class Parser {
             // via a compound part:
             if (token.getType() != Token::TYPE_COMPOUND_PART) {
                 status = kUnitIdentifierSyntaxError;
-                return result;
+                return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
             }
 
             switch (token.getMatch()) {
@@ -724,15 +961,16 @@ class Parser {
                     // Mixed compound units not yet supported,
                     // TODO(CLDR-13701).
                     status = kUnitIdentifierSyntaxError;
-                    return result;
+                    return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
                 }
                 fAfterPer = true;
-                result.dimensionality = -1;
+                fJustSawPer = true;
+                singleUnitResult.dimensionality = -1;
                 break;
 
             case COMPOUND_PART_TIMES:
                 if (fAfterPer) {
-                    result.dimensionality = -1;
+                    singleUnitResult.dimensionality = -1;
                 }
                 break;
 
@@ -741,7 +979,7 @@ class Parser {
                     // Can't start with "-and-", and mixed compound units
                     // not yet supported, TODO(CLDR-13701).
                     status = kUnitIdentifierSyntaxError;
-                    return result;
+                    return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
                 }
                 sawAnd = true;
                 break;
@@ -749,52 +987,61 @@ class Parser {
 
             token = nextToken(status);
             if (U_FAILURE(status)) {
-                return result;
+                return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
             }
         }
 
+        if (token.getType() == Token::TYPE_CONSTANT_DENOMINATOR) {
+            if (!fJustSawPer) {
+                status = kUnitIdentifierSyntaxError;
+                return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
+            }
+
+            return SingleUnitOrConstant::constantDenominatorValue(token.getConstantDenominator());
+        }
+
         // Read tokens until we have a complete SingleUnit or we reach the end.
         while (true) {
             switch (token.getType()) {
-                case Token::TYPE_POWER_PART:
-                    if (state > 0) {
-                        status = kUnitIdentifierSyntaxError;
-                        return result;
-                    }
-                    result.dimensionality *= token.getPower();
-                    state = 1;
-                    break;
-
-                case Token::TYPE_PREFIX:
-                    if (state > 1) {
-                        status = kUnitIdentifierSyntaxError;
-                        return result;
-                    }
-                    result.unitPrefix = token.getUnitPrefix();
-                    state = 2;
-                    break;
-
-                case Token::TYPE_SIMPLE_UNIT:
-                    result.index = token.getSimpleUnitIndex();
-                    return result;
+            case Token::TYPE_POWER_PART:
+                if (state > 0) {
+                    status = kUnitIdentifierSyntaxError;
+                    return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
+                }
+                singleUnitResult.dimensionality *= token.getPower();
+                state = 1;
+                break;
 
-                default:
+            case Token::TYPE_PREFIX:
+                if (state > 1) {
                     status = kUnitIdentifierSyntaxError;
-                    return result;
+                    return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
+                }
+                singleUnitResult.unitPrefix = token.getUnitPrefix();
+                state = 2;
+                break;
+
+            case Token::TYPE_SIMPLE_UNIT:
+                singleUnitResult.index = token.getSimpleUnitIndex();
+                return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
+
+            default:
+                status = kUnitIdentifierSyntaxError;
+                return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
             }
 
             if (!hasNext()) {
                 // We ran out of tokens before finding a complete single unit.
                 status = kUnitIdentifierSyntaxError;
-                return result;
+                return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
             }
             token = nextToken(status);
             if (U_FAILURE(status)) {
-                return result;
+                return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
             }
         }
 
-        return result;
+        return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
     }
 };
 
@@ -1168,10 +1415,14 @@ void MeasureUnitImpl::serialize(UErrorCode &status) {
                 } else {
                     result.append(StringPiece("-per-"), status);
                 }
-            } else {
-                if (result.length() != 0) {
+
+                if (this->constantDenominator != 0) {
+                    result.appendNumber(this->constantDenominator, status);
                     result.append(StringPiece("-"), status);
                 }
+
+            } else if (result.length() != 0) {
+                result.append(StringPiece("-"), status);
             }
         }
 
@@ -1181,30 +1432,84 @@ void MeasureUnitImpl::serialize(UErrorCode &status) {
     this->identifier = CharString(result, status);
 }
 
-MeasureUnit MeasureUnitImpl::build(UErrorCode& status) && {
+MeasureUnit MeasureUnitImpl::build(UErrorCode &status) && {
     this->serialize(status);
     return MeasureUnit(std::move(*this));
 }
 
-MeasureUnit MeasureUnit::forIdentifier(StringPiece identifier, UErrorCode& status) {
+MeasureUnit MeasureUnit::forIdentifier(StringPiece identifier, UErrorCode &status) {
     return Parser::from(identifier, status).parse(status).build(status);
 }
 
-UMeasureUnitComplexity MeasureUnit::getComplexity(UErrorCode& status) const {
+UMeasureUnitComplexity MeasureUnit::getComplexity(UErrorCode &status) const {
     MeasureUnitImpl temp;
     return MeasureUnitImpl::forMeasureUnit(*this, temp, status).complexity;
 }
 
-UMeasurePrefix MeasureUnit::getPrefix(UErrorCode& status) const {
+UMeasurePrefix MeasureUnit::getPrefix(UErrorCode &status) const {
     return SingleUnitImpl::forMeasureUnit(*this, status).unitPrefix;
 }
 
-MeasureUnit MeasureUnit::withPrefix(UMeasurePrefix prefix, UErrorCode& status) const UPRV_NO_SANITIZE_UNDEFINED {
+MeasureUnit MeasureUnit::withPrefix(UMeasurePrefix prefix,
+                                    UErrorCode &status) const UPRV_NO_SANITIZE_UNDEFINED {
     SingleUnitImpl singleUnit = SingleUnitImpl::forMeasureUnit(*this, status);
     singleUnit.unitPrefix = prefix;
     return singleUnit.build(status);
 }
 
+// Returns the constant denominator of this unit, or 0 if none is set.
+// Sets U_ILLEGAL_ARGUMENT_ERROR if the unit is neither single nor compound.
+int64_t MeasureUnit::getConstantDenominator(UErrorCode &status) const {
+    auto complexity = this->getComplexity(status);
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+
+    if (complexity != UMEASURE_UNIT_SINGLE && complexity != UMEASURE_UNIT_COMPOUND) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    if (this->fImpl == nullptr) {
+        return 0;
+    }
+
+    return static_cast<int64_t>(this->fImpl->constantDenominator);
+}
+
+// Returns a copy of this unit with its constant denominator set to
+// `denominator`; 0 clears the constant denominator. Sets
+// U_ILLEGAL_ARGUMENT_ERROR if `denominator` is negative or if the unit is
+// neither single nor compound.
+MeasureUnit MeasureUnit::withConstantDenominator(int64_t denominator, UErrorCode &status) const {
+    auto complexity = this->getComplexity(status);
+    if (U_FAILURE(status)) {
+        return {};
+    }
+    if (complexity != UMEASURE_UNIT_SINGLE && complexity != UMEASURE_UNIT_COMPOUND) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return {};
+    }
+
+    // The denominator is stored unsigned; a negative value would wrap around.
+    if (denominator < 0) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return {};
+    }
+
+    MeasureUnitImpl impl = MeasureUnitImpl::forMeasureUnitMaybeCopy(*this, status);
+    if (U_FAILURE(status)) {
+        return {};
+    }
+
+    impl.constantDenominator = static_cast<uint64_t>(denominator);
+    // Clearing the constant on a unit with fewer than two single units leaves
+    // a single unit; every other combination is compound.
+    impl.complexity = (denominator == 0 && impl.singleUnits.length() < 2) ? UMEASURE_UNIT_SINGLE
+                                                                          : UMEASURE_UNIT_COMPOUND;
+    return std::move(impl).build(status);
+}
+
 int32_t MeasureUnit::getDimensionality(UErrorCode& status) const {
     SingleUnitImpl singleUnit = SingleUnitImpl::forMeasureUnit(*this, status);
     if (U_FAILURE(status)) { return 0; }
diff --git a/icu4c/source/i18n/measunit_impl.h b/icu4c/source/i18n/measunit_impl.h
index f6a8f90dc94f..5779020f5bef 100644
--- a/icu4c/source/i18n/measunit_impl.h
+++ b/icu4c/source/i18n/measunit_impl.h
@@ -328,6 +328,14 @@ class U_I18N_API MeasureUnitImpl : public UMemory {
      */
     CharString identifier;
 
+    /**
+     * Represents the unit constant denominator.
+     *
+     * NOTE:
+     *   if set to 0, it means that the constant is not set.
+     */
+    uint64_t constantDenominator = 0;
+
     // For calling serialize
     // TODO(icu-units#147): revisit serialization
     friend class number::impl::LongNameHandler;
diff --git a/icu4c/source/i18n/number_fluent.cpp b/icu4c/source/i18n/number_fluent.cpp
index 0ce01c854cae..5f2f740df71b 100644
--- a/icu4c/source/i18n/number_fluent.cpp
+++ b/icu4c/source/i18n/number_fluent.cpp
@@ -654,9 +654,6 @@ void LocalizedNumberFormatter::formatImpl(impl::UFormattedNumberData* results, U
     } else {
         NumberFormatterImpl::formatStatic(fMacros, results, status);
     }
-    if (U_FAILURE(status)) {
-        return;
-    }
     results->getStringRef().writeTerminator(status);
 }
 
diff --git a/icu4c/source/test/intltest/measfmttest.cpp b/icu4c/source/test/intltest/measfmttest.cpp
index fbe71bdf8873..9c24ff34f397 100644
--- a/icu4c/source/test/intltest/measfmttest.cpp
+++ b/icu4c/source/test/intltest/measfmttest.cpp
@@ -5450,7 +5450,7 @@ void MeasureFormatTest::TestUnitPerUnitResolution() {
             actual,
             pos,
             status);
-    assertEquals("", "50 psi", actual);
+    assertEquals("TestUnitPerUnitResolution", "50 psi", actual);
 }
 
 void MeasureFormatTest::TestIndividualPluralFallback() {
diff --git a/icu4c/source/test/intltest/units_test.cpp b/icu4c/source/test/intltest/units_test.cpp
index add612c27678..88c6a8711924 100644
--- a/icu4c/source/test/intltest/units_test.cpp
+++ b/icu4c/source/test/intltest/units_test.cpp
@@ -50,6 +50,7 @@ class UnitsTest : public IntlTest {
     void testComplexUnitsConverter();
     void testComplexUnitsConverterSorting();
     void testUnitPreferencesWithCLDRTests();
+    void testUnitsConstantsDenomenator();
     void testConverter();
 };
 
@@ -67,6 +68,7 @@ void UnitsTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
     TESTCASE_AUTO(testComplexUnitsConverter);
     TESTCASE_AUTO(testComplexUnitsConverterSorting);
     TESTCASE_AUTO(testUnitPreferencesWithCLDRTests);
+    TESTCASE_AUTO(testUnitsConstantsDenomenator);
     TESTCASE_AUTO(testConverter);
     TESTCASE_AUTO_END;
 }
@@ -1157,4 +1159,35 @@ void UnitsTest::testUnitPreferencesWithCLDRTests() {
     }
 }
 
+void UnitsTest::testUnitsConstantsDenomenator() {
+    IcuTestErrorCode status(*this, "UnitTests::testUnitsConstantsDenomenator");
+
+    // // Test Cases
+    // struct TestCase {
+    //     const char *source;
+    //     const uint64_t expectedConstant;
+    // } testCases[]{
+    //     {"meter-per-100", 100},
+    // };
+
+    // for (const auto &testCase : testCases) {
+    //     MeasureUnit unit = MeasureUnit::forIdentifier(testCase.source, status);
+    //     if (status.errIfFailureAndReset("forIdentifier(\"%s\")", testCase.source)) {
+    //         continue;
+    //     }
+
+    //     uint64_t constant = unit.getConstantDenominator(status);
+    //     if (status.errIfFailureAndReset("getConstantDenominator(\"%s\")", testCase.source)) {
+    //         continue;
+    //     }
+
+    //     if (constant != testCase.expectedConstant) {
+    //         status.set(U_ILLEGAL_ARGUMENT_ERROR);
+    //         if (status.errIfFailureAndReset("getConstantDenominator(\"%s\")", testCase.source)) {
+    //             continue;
+    //         }
+    //     }
+    // }
+}
+
 #endif /* #if !UCONFIG_NO_FORMATTING */