Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22834 MF2: make tests compliant with schema and update spec tests #3063

Merged
merged 1 commit into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion icu4c/source/common/unicode/utypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -584,12 +584,13 @@ typedef enum UErrorCode {
U_MF_OPERAND_MISMATCH_ERROR, /**< An operand provided to a function does not have the required form for that function @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_UNSUPPORTED_STATEMENT_ERROR, /**< A message includes a reserved statement. @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_UNSUPPORTED_EXPRESSION_ERROR, /**< A message includes syntax reserved for future standardization or private implementation use. @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_DUPLICATE_VARIANT_ERROR, /**< A message includes a variant with the same key list as another variant. @internal ICU 76 technology preview @deprecated This API is for technology preview only. */
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal formatting API error code.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_FMT_PARSE_ERROR_LIMIT = 0x10121,
U_FMT_PARSE_ERROR_LIMIT = 0x10122,
#endif // U_HIDE_DEPRECATED_API

/*
Expand Down
3 changes: 2 additions & 1 deletion icu4c/source/common/utypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,8 @@ _uFmtErrorName[U_FMT_PARSE_ERROR_LIMIT - U_FMT_PARSE_ERROR_START] = {
"U_MF_DUPLICATE_DECLARATION_ERROR",
"U_MF_OPERAND_MISMATCH_ERROR",
"U_MF_UNSUPPORTED_STATEMENT_ERROR",
"U_MF_UNSUPPORTED_EXPRESSION_ERROR"
"U_MF_UNSUPPORTED_EXPRESSION_ERROR",
"U_MF_DUPLICATE_VARIANT_ERROR"
};

static const char * const
Expand Down
29 changes: 28 additions & 1 deletion icu4c/source/i18n/messageformat2_checker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Checks data model errors
The following are checked here:
Variant Key Mismatch
Duplicate Variant
Missing Fallback Variant (called NonexhaustivePattern here)
Missing Selector Annotation
Duplicate Declaration
Expand Down Expand Up @@ -162,6 +163,7 @@ void Checker::checkVariants(UErrorCode& status) {

// Check that one variant includes only wildcards
bool defaultExists = false;
bool duplicatesExist = false;

for (int32_t i = 0; i < dataModel.numVariants(); i++) {
const SelectorKeys& k = variants[i].getKeys();
Expand All @@ -173,10 +175,35 @@ void Checker::checkVariants(UErrorCode& status) {
return;
}
defaultExists |= areDefaultKeys(keys, len);

// Check if this variant's keys are duplicated by any other variant's keys
if (!duplicatesExist) {
// This check takes quadratic time, but it can be optimized if checking
// this property turns out to be a bottleneck.
for (int32_t j = 0; j < i; j++) {
const SelectorKeys& k1 = variants[j].getKeys();
const Key* keys1 = k1.getKeysInternal();
bool allEqual = true;
// This variant was already checked,
// so we know keys1.len == len
for (int32_t kk = 0; kk < len; kk++) {
if (!(keys[kk] == keys1[kk])) {
allEqual = false;
break;
}
}
if (allEqual) {
duplicatesExist = true;
}
}
}
}

if (duplicatesExist) {
errors.addError(StaticErrorType::DuplicateVariant, status);
}
if (!defaultExists) {
errors.addError(StaticErrorType::NonexhaustivePattern, status);
return;
}
}

Expand Down
34 changes: 17 additions & 17 deletions icu4c/source/i18n/messageformat2_data_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,9 @@ bool Key::operator==(const Key& other) const {
if (isWildcard()) {
return other.isWildcard();
}
if (other.isWildcard()) {
return false;
}
return (asLiteral() == other.asLiteral());
}

Expand Down Expand Up @@ -829,31 +832,28 @@ const Expression& Binding::getValue() const {
} else {
const Operator* rator = rhs.getOperator(errorCode);
bool hasOperator = U_SUCCESS(errorCode);
if (hasOperator && rator->isReserved()) {
errorCode = U_INVALID_STATE_ERROR;
// Clear error code -- the "error" from the absent operator
// is handled
errorCode = U_ZERO_ERROR;
b = Binding(variableName, std::move(rhs));
b.local = false;
if (hasOperator) {
rator = b.getValue().getOperator(errorCode);
U_ASSERT(U_SUCCESS(errorCode));
b.annotation = &rator->contents;
} else {
// Clear error code -- the "error" from the absent operator
// is handled
errorCode = U_ZERO_ERROR;
b = Binding(variableName, std::move(rhs));
b.local = false;
if (hasOperator) {
rator = b.getValue().getOperator(errorCode);
U_ASSERT(U_SUCCESS(errorCode));
b.annotation = std::get_if<Callable>(&(rator->contents));
} else {
b.annotation = nullptr;
}
U_ASSERT(!hasOperator || b.annotation != nullptr);
b.annotation = nullptr;
}
U_ASSERT(!hasOperator || b.annotation != nullptr);
}
}
return b;
}

const OptionMap& Binding::getOptionsInternal() const {
U_ASSERT(annotation != nullptr);
return annotation->getOptions();
U_ASSERT(std::holds_alternative<Callable>(*annotation));
return std::get_if<Callable>(annotation)->getOptions();
}

void Binding::updateAnnotation() {
Expand All @@ -863,7 +863,7 @@ void Binding::updateAnnotation() {
return;
}
U_ASSERT(U_SUCCESS(localErrorCode) && !rator->isReserved());
annotation = std::get_if<Callable>(&(rator->contents));
annotation = &rator->contents;
}

Binding::Binding(const Binding& other) : var(other.var), expr(other.expr), local(other.local) {
Expand Down
8 changes: 8 additions & 0 deletions icu4c/source/i18n/messageformat2_errors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ namespace message2 {
status = U_MF_VARIANT_KEY_MISMATCH_ERROR;
break;
}
case StaticErrorType::DuplicateVariant: {
status = U_MF_DUPLICATE_VARIANT_ERROR;
break;
}
case StaticErrorType::NonexhaustivePattern: {
status = U_MF_NONEXHAUSTIVE_PATTERN_ERROR;
break;
Expand Down Expand Up @@ -211,6 +215,10 @@ namespace message2 {
dataModelError = true;
break;
}
case StaticErrorType::DuplicateVariant: {
dataModelError = true;
break;
}
case StaticErrorType::NonexhaustivePattern: {
dataModelError = true;
break;
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/i18n/messageformat2_errors.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ namespace message2 {
enum StaticErrorType {
DuplicateDeclarationError,
DuplicateOptionName,
DuplicateVariant,
MissingSelectorAnnotation,
NonexhaustivePattern,
SyntaxError,
Expand Down
41 changes: 35 additions & 6 deletions icu4c/source/i18n/messageformat2_function_registry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@

#if !UCONFIG_NO_MF2

#include <math.h>

#include "unicode/dtptngen.h"
#include "unicode/messageformat2_data_model_names.h"
#include "unicode/messageformat2_function_registry.h"
#include "unicode/smpdtfmt.h"
#include "charstr.h"
#include "double-conversion.h"
#include "messageformat2_allocation.h"
#include "messageformat2_function_registry_internal.h"
#include "messageformat2_macros.h"
Expand Down Expand Up @@ -421,25 +424,51 @@ static FormattedPlaceholder notANumber(const FormattedPlaceholder& input) {
return FormattedPlaceholder(input, FormattedValue(UnicodeString("NaN")));
}

static FormattedPlaceholder stringAsNumber(const number::LocalizedNumberFormatter& nf, const FormattedPlaceholder& input, UErrorCode& errorCode) {
static double parseNumberLiteral(const FormattedPlaceholder& input, UErrorCode& errorCode) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestions from @sffc

  1. Use function from double-conversion.h
  2. Use function from number_decimalquantity.h
  3. Use function from util.h called parseNumber to replace the helper methods that parse the integer and fractional portions.

Either option 1 or 2 should be sufficient to do what we need here. Option 1 gives a double, option 2 gives a DecimalQuantity. At least option 1 should be able to handle sign, decimal point, and exponent, and probably option 2. Option 3 is helpful but doesn't replace as much code as the other 2 options.

Copy link
Contributor Author

@catamorphism catamorphism Sep 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried both (1) and (2), and the problem is that there are some strings that both functions accept, which are not valid number literals according to the MF2 grammar: for example, 01 (leading zero with no decimal point), +1 (leading '+'), .1 (decimal point with no leading 0), and 1. (trailing decimal point).

Since these cases are relatively simple to check for, I just added a check before calling into StringToDouble (option 1) - see 176a232.

I didn't like (3) since calling parseNumber to parse the decimal part still means having to do math to add the two parts together, so it doesn't seem to simplify the code that much.

Let me know what you think.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks great! Option 1 seemed the most likely & directly applicable.

if (U_FAILURE(errorCode)) {
return {};
}

double numberValue;
// Copying string to avoid GCC dangling-reference warning
// (although the reference is safe)
UnicodeString inputStr = input.asFormattable().getString(errorCode);
// Precondition: `input`'s source Formattable has type string
if (U_FAILURE(errorCode)) {
return {};
}
UErrorCode localErrorCode = U_ZERO_ERROR;
strToDouble(inputStr, numberValue, localErrorCode);
if (U_FAILURE(localErrorCode)) {

// Hack: Check for cases that are forbidden by the MF2 grammar
// but allowed by StringToDouble
int32_t len = inputStr.length();

if (len > 0 && ((inputStr[0] == '+')
|| (inputStr[0] == '0' && len > 1 && inputStr[1] != '.')
|| (inputStr[len - 1] == '.')
|| (inputStr[0] == '.'))) {
errorCode = U_MF_OPERAND_MISMATCH_ERROR;
return 0;
}

// Otherwise, convert to double using double_conversion::StringToDoubleConverter
using namespace double_conversion;
int processedCharactersCount = 0;
StringToDoubleConverter converter(0, 0, 0, "", "");
double result =
converter.StringToDouble(reinterpret_cast<const uint16_t*>(inputStr.getBuffer()),
len,
&processedCharactersCount);
if (processedCharactersCount != len) {
errorCode = U_MF_OPERAND_MISMATCH_ERROR;
}
return result;
}

static FormattedPlaceholder tryParsingNumberLiteral(const number::LocalizedNumberFormatter& nf, const FormattedPlaceholder& input, UErrorCode& errorCode) {
double numberValue = parseNumberLiteral(input, errorCode);
if (U_FAILURE(errorCode)) {
return notANumber(input);
}

UErrorCode savedStatus = errorCode;
number::FormattedNumber result = nf.formatDouble(numberValue, errorCode);
// Ignore U_USING_DEFAULT_WARNING
Expand Down Expand Up @@ -590,7 +619,7 @@ FormattedPlaceholder StandardFunctions::Number::format(FormattedPlaceholder&& ar
}
case UFMT_STRING: {
// Try to parse the string as a number
return stringAsNumber(realFormatter, arg, errorCode);
return tryParsingNumberLiteral(realFormatter, arg, errorCode);
}
default: {
// Other types can't be parsed as a number
Expand Down
16 changes: 4 additions & 12 deletions icu4c/source/i18n/messageformat2_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,19 +60,11 @@ using namespace pluralimpl;
// Fallback
#define REPLACEMENT ((UChar32) 0xFFFD)

// MessageFormat2 uses four keywords: `.input`, `.local`, `.when`, and `.match`.
// MessageFormat2 uses three keywords: `.input`, `.local`, and `.match`.

static constexpr UChar32 ID_INPUT[] = {
0x2E, 0x69, 0x6E, 0x70, 0x75, 0x74, 0 /* ".input" */
};

static constexpr UChar32 ID_LOCAL[] = {
0x2E, 0x6C, 0x6F, 0x63, 0x61, 0x6C, 0 /* ".local" */
};

static constexpr UChar32 ID_MATCH[] = {
0x2E, 0x6D, 0x61, 0x74, 0x63, 0x68, 0 /* ".match" */
};
static constexpr std::u16string_view ID_INPUT = u".input";
static constexpr std::u16string_view ID_LOCAL = u".local";
static constexpr std::u16string_view ID_MATCH = u".match";

// Returns immediately if `errorCode` indicates failure
#define CHECK_ERROR(errorCode) \
Expand Down
Loading