Skip to content

Commit

Permalink
ICU-22785 move cptrie bit setter to toolutil; add getCPTrieSize()
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Jun 5, 2024
1 parent c439dcd commit 47e9389
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 41 deletions.
13 changes: 13 additions & 0 deletions icu4c/source/common/uprops.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,19 @@ namespace {
// 0: Script=bits 9..0
// 9.. 0 UScriptCode, or index to Script_Extensions

// *Note*: If we need more than the available bits for new properties,
// then we could move the Age property out of the properties vectors.
// For example, we could store the Age property in its own trie.
// In a small, 8-bit-value-width CodePointTrie, it would be larger than
// the amount of data that we would save in the properties vectors and their trie,
// but the size increase would be a small percentage of the total uprops.icu size.
// It would certainly be a much smaller increase than widening the properties vectors.
// The savings in the properties vectors+trie from pulling out the Age property
// are partly from mediocre correlation between Age and other property values.
// (Adding new characters to existing scripts tends to split property vectors where
// new characters are similar to old ones.)
// See https://github.com/unicode-org/icu/pull/3025 for details.

// Bit field of the Age property inside a 32-bit properties word:
// the mask selects the top 8 bits (31..24), and shifting right by
// UPROPS_AGE_SHIFT moves that field down to bits 7..0.
inline constexpr uint32_t UPROPS_AGE_MASK = 0xff000000;
inline constexpr int32_t UPROPS_AGE_SHIFT = 24;

Expand Down
70 changes: 70 additions & 0 deletions icu4c/source/tools/toolutil/toolutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,13 @@

#include "unicode/errorcode.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/umutablecptrie.h"
#include "unicode/ucptrie.h"
#include "cmemory.h"
#include "cstring.h"
#include "toolutil.h"
#include "uassert.h"

U_NAMESPACE_BEGIN

Expand All @@ -82,6 +86,72 @@ void IcuToolErrorCode::handleFailure() const {
exit(errorCode);
}

namespace toolutil {

/**
 * Sets or clears one bit in the trie values of the start..end range,
 * without changing the other bits in the trie values of that range.
 *
 * @param mutableCPTrie the trie to modify
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param shift index of the bit to change, 0..31; the mask is U_MASK(shift)
 * @param on true to set the bit, false to clear it
 * @param errorCode in/out ICU error code. The function does nothing if it
 *        already indicates a failure. It is set to U_ILLEGAL_ARGUMENT_ERROR
 *        if shift is not a valid bit index for a 32-bit trie value.
 */
void setCPTrieBit(UMutableCPTrie *mutableCPTrie,
                  UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return; }
    // Trie values are 32 bits wide, so only bit indexes 0..31 are meaningful.
    // Reject anything else up front rather than building a mask from
    // an out-of-range shift count.
    if (shift < 0 || shift > 31) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    const uint32_t mask = U_MASK(shift);
    setCPTrieBits(mutableCPTrie, start, end, mask, on ? mask : 0, errorCode);
}

/**
 * Replaces the bits selected by mask with value in the trie values of the
 * start..end range, without changing the other bits of those trie values.
 *
 * Code points whose value would not change are left untouched.
 * An empty range (start > end) is a no-op.
 *
 * @param mutableCPTrie the trie to modify
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param mask the bits to replace
 * @param value the new contents of the masked bits; must not have bits outside mask
 * @param errorCode in/out ICU error code. The function does nothing if it
 *        already indicates a failure. It is set to U_ILLEGAL_ARGUMENT_ERROR
 *        if value has bits outside mask, or if the range walk cannot make
 *        progress because the trie reports no range starting at the current
 *        code point. Errors from umutablecptrie_set()/umutablecptrie_setRange()
 *        are passed through.
 */
void setCPTrieBits(UMutableCPTrie *mutableCPTrie,
                   UChar32 start, UChar32 end, uint32_t mask, uint32_t value,
                   UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return; }
    // The value must not have any bits set outside of the mask.
    if ((value & ~mask) != 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (start == end) {
        // Single code point: a direct get/set avoids the range walk.
        const uint32_t oldValue = umutablecptrie_get(mutableCPTrie, start);
        const uint32_t newValue = (oldValue & ~mask) | value;
        if (newValue != oldValue) {
            umutablecptrie_set(mutableCPTrie, start, newValue, &errorCode);
        }
        return;
    }
    // Walk the existing same-value ranges inside start..end so that
    // each one is rewritten with a single setRange() call.
    while (start <= end && U_SUCCESS(errorCode)) {
        uint32_t oldValue = 0;
        UChar32 rangeEnd = umutablecptrie_getRange(
            mutableCPTrie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &oldValue);
        if (rangeEnd < start) {
            // getRange() found no range beginning at start (it returns a negative
            // sentinel for an invalid start). Without this check, oldValue would be
            // read without having been written, and start would wrap back to 0,
            // touching unrelated code points or looping forever.
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        if (rangeEnd > end) {
            rangeEnd = end;
        }
        const uint32_t newValue = (oldValue & ~mask) | value;
        if (newValue != oldValue) {
            umutablecptrie_setRange(mutableCPTrie, start, rangeEnd, newValue, &errorCode);
        }
        start = rangeEnd + 1;
    }
}

/**
 * Builds an immutable trie of the given type and value width from mt and
 * returns the number of bytes that its binary form needs.
 *
 * On failure, prints a message to stderr and returns -1.
 *
 * NOTE(review): the ICU API documentation for umutablecptrie_buildImmutable()
 * says that the mutable trie is empty afterwards -- confirm that callers
 * do not expect mt to keep its contents after this call.
 *
 * @param mt the mutable trie to build from
 * @param type the immutable trie type to build
 * @param valueWidth the immutable trie value width to build
 * @return the serialized size in bytes, or -1 on error
 */
int32_t getCPTrieSize(UMutableCPTrie *mt, UCPTrieType type, UCPTrieValueWidth valueWidth) {
    UErrorCode errorCode = U_ZERO_ERROR;
    UCPTrie *cpTrie = umutablecptrie_buildImmutable(mt, type, valueWidth, &errorCode);
    if (U_FAILURE(errorCode)) {
        fprintf(stderr,
                "toolutil/getCPTrieSize error: umutablecptrie_buildImmutable() failed: %s\n",
                u_errorName(errorCode));
        return -1;
    }
    // Only the length is needed, so preflight with an empty buffer:
    // ucptrie_toBinary() then reports the required length together with
    // U_BUFFER_OVERFLOW_ERROR. This avoids a large stack buffer and the need
    // to hand the function suitably aligned output memory.
    int32_t size = ucptrie_toBinary(cpTrie, nullptr, 0, &errorCode);
    ucptrie_close(cpTrie);
    if (U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
        fprintf(stderr,
                "toolutil/getCPTrieSize error: ucptrie_toBinary() failed: %s (length %ld)\n",
                u_errorName(errorCode), (long)size);
        return -1;
    }
    U_ASSERT((size & 3) == 0);  // multiple of 4 bytes
    return size;
}

} // toolutil

U_NAMESPACE_END

static int32_t currentYear = -1;
Expand Down
25 changes: 25 additions & 0 deletions icu4c/source/tools/toolutil/toolutil.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#ifdef __cplusplus

#include "unicode/errorcode.h"
#include "unicode/umutablecptrie.h"

U_NAMESPACE_BEGIN

Expand All @@ -46,6 +47,30 @@ class U_TOOLUTIL_API IcuToolErrorCode : public ErrorCode {
const char *location;
};

namespace toolutil {

/**
 * Sets one bit in the trie values of the start..end range,
 * without changing the other bits in the trie values of that range.
 *
 * The bit is selected with the mask U_MASK(shift).
 * It is turned on if `on` is true, otherwise it is turned off.
 * The work is delegated to setCPTrieBits() with that mask.
 *
 * @param mutableCPTrie the trie to modify
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param shift index of the bit to change
 * @param on true to set the bit, false to clear it
 * @param errorCode in/out ICU error code
 */
U_TOOLUTIL_API void
setCPTrieBit(UMutableCPTrie *mutableCPTrie,
UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode);

/**
 * Sets a bit set (defined by the mask) in the trie values of the start..end range,
 * without changing the other bits in the trie values of that range.
 * The given value must not have any bits set outside of the mask;
 * otherwise the errorCode is set to U_ILLEGAL_ARGUMENT_ERROR and nothing is changed.
 *
 * Does nothing if the errorCode already indicates a failure on entry.
 *
 * @param mutableCPTrie the trie to modify
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param mask the bits to replace in each trie value
 * @param value the new contents of the masked bits
 * @param errorCode in/out ICU error code
 */
U_TOOLUTIL_API void
setCPTrieBits(UMutableCPTrie *mutableCPTrie,
UChar32 start, UChar32 end, uint32_t mask, uint32_t value, UErrorCode &errorCode);

/**
 * Builds an immutable trie of the given type and value width from the mutable trie,
 * serializes it with ucptrie_toBinary(), and returns the length that call reports,
 * in bytes.
 * A U_BUFFER_OVERFLOW_ERROR from the serialization is not treated as a failure.
 *
 * On any other failure, prints an error message to stderr and returns -1.
 *
 * @param mt the mutable trie to build from
 * @param type the immutable trie type to build
 * @param valueWidth the immutable trie value width to build
 * @return the serialized size in bytes, or -1 on error
 */
U_TOOLUTIL_API int32_t
getCPTrieSize(UMutableCPTrie *mt, UCPTrieType type, UCPTrieValueWidth valueWidth);

} // toolutil

U_NAMESPACE_END

#endif
Expand Down
49 changes: 8 additions & 41 deletions tools/unicode/c/genprops/emojipropsbuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "cmemory.h"
#include "emojiprops.h"
#include "genprops.h"
#include "toolutil.h"
#include "uassert.h"
#include "unewdata.h"
#include "uparse.h"
Expand Down Expand Up @@ -108,8 +109,6 @@ class EmojiPropsBuilder : public PropsBuilder {
void parsePropsOfStringsLine(char *fields[][2], UErrorCode &errorCode);

private:
void setBit(UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode);
void setBits(UChar32 start, UChar32 end, uint32_t value, uint32_t mask, UErrorCode &errorCode);
void parsePropsOfStringsFile(const char *path, UErrorCode &errorCode);

static int32_t getTrieIndex(int32_t index) {
Expand Down Expand Up @@ -231,48 +230,14 @@ EmojiPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
for (const auto &p2b : propToBinaries) {
U_ASSERT(p2b.shift < 8);
if (newValues.contains(p2b.prop)) {
setBit(props.start, props.end, p2b.shift, props.binProps[p2b.prop], errorCode);
toolutil::setCPTrieBit(mutableCPTrie,
props.start, props.end, p2b.shift, props.binProps[p2b.prop],
errorCode);
}
}
}
}

// Turns the single bit selected by U_MASK(shift) on or off for start..end,
// by delegating to setBits() with a one-bit mask.
void
EmojiPropsBuilder::setBit(UChar32 start, UChar32 end, int32_t shift, bool on,
                          UErrorCode &errorCode) {
    const uint32_t bitMask = U_MASK(shift);
    if (on) {
        setBits(start, end, bitMask, bitMask, errorCode);
    } else {
        setBits(start, end, 0, bitMask, errorCode);
    }
}

// Replaces the bits selected by mask with value in the trie values of start..end,
// leaving the other bits of those values unchanged.
// Code points whose value would not change are not written.
// Does nothing if errorCode already indicates a failure or if start > end.
// Sets U_ILLEGAL_ARGUMENT_ERROR if the trie reports no range beginning at the
// current start code point, so that the range walk cannot make progress.
void
EmojiPropsBuilder::setBits(UChar32 start, UChar32 end, uint32_t value, uint32_t mask,
                           UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return; }

    if (start == end) {
        // Single code point: a direct get/set avoids the range walk.
        const uint32_t oldValue = umutablecptrie_get(mutableCPTrie, start);
        const uint32_t newValue = (oldValue & ~mask) | value;
        if (newValue != oldValue) {
            umutablecptrie_set(mutableCPTrie, start, newValue, &errorCode);
        }
        return;
    }
    // Walk the existing same-value ranges inside start..end so that
    // each one is rewritten with a single setRange() call.
    while (start <= end && U_SUCCESS(errorCode)) {
        uint32_t oldValue = 0;
        UChar32 rangeEnd = umutablecptrie_getRange(
            mutableCPTrie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &oldValue);
        if (rangeEnd < start) {
            // getRange() found no range beginning at start (negative sentinel for an
            // invalid start, e.g. from bad input data). Without this check, oldValue
            // would be read without having been written, and start would wrap back
            // to 0, touching unrelated code points or looping forever.
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        if (rangeEnd > end) {
            rangeEnd = end;
        }
        const uint32_t newValue = (oldValue & ~mask) | value;
        if (newValue != oldValue) {
            umutablecptrie_setRange(mutableCPTrie, start, rangeEnd, newValue, &errorCode);
        }
        start = rangeEnd + 1;
    }
}

namespace {

void U_CALLCONV
Expand Down Expand Up @@ -347,7 +312,8 @@ void EmojiPropsBuilder::parsePropsOfStringsLine(char *fields[][2], UErrorCode &e
}
uint32_t start, end;
u_parseCodePointRange(rangeOrString, &start, &end, &errorCode);
setBit(start, end, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
toolutil::setCPTrieBit(mutableCPTrie,
start, end, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
} else {
// Code point or string:
// 23F0 ; Basic_Emoji ; alarm clock
Expand All @@ -371,7 +337,8 @@ void EmojiPropsBuilder::parsePropsOfStringsLine(char *fields[][2], UErrorCode &e
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
setBit(first, first, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
toolutil::setCPTrieBit(mutableCPTrie,
first, first, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
} else {
// more than one code point
UnicodeString us(false, s, length);
Expand Down

0 comments on commit 47e9389

Please sign in to comment.