Skip to content

Commit

Permalink
upTeX, upbibtex: Unicode 16.0
Browse files Browse the repository at this point in the history
git-svn-id: svn://tug.org/texlive/trunk/Build/source@72394 c570f23f-e606-0410-a88d-b1316a301751
  • Loading branch information
t-tk committed Sep 28, 2024
1 parent 774a56b commit bf5e678
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 40 deletions.
10 changes: 10 additions & 0 deletions texk/web2c/uptexdir/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
2024-09-28 TANAKA Takuji <[email protected]>

* uptex-m.ch, upbibtex.ch, kanji.c:
Update reference of Unicode blocks to
"Blocks-16.0.0.txt Date: 2024-02-22".
Also add a new block
"CJK Unified Ideographs Extension J"
which is approved for Unicode 17.0 (2025).
* tests/unibib.{tex,bib,bbl}: Update a test.

2024-09-22 TANAKA Takuji <[email protected]>

* ptex-base.ch, ptex_version.h: Version p4.1.2.
Expand Down
71 changes: 41 additions & 30 deletions texk/web2c/uptexdir/kanji.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ integer ktoken_to_chr(integer c)
}

/* Ref. http://www.unicode.org/Public/UNIDATA/Blocks.txt */
/* # Blocks-15.1.0.txt */
/* # Date: 2023-07-28, 15:47:20 GMT */
/* # Blocks-16.0.0.txt */
/* # Date: 2024-02-22 */
static long ucs_range[]={
0x0000, /* Basic Latin */ /* 0x00 */
0x0080, /* Latin-1 Supplement */
Expand Down Expand Up @@ -279,14 +279,15 @@ static long ucs_range[]={
0x10500, /* Elbasan */
0x10530, /* Caucasian Albanian */
0x10570, /* Vithkuqi */
0x105C0, /* Todhri */
0x10600, /* Linear A */
0x10780, /* Latin Extended-F */
0x10800, /* Cypriot Syllabary */
0x10840, /* Imperial Aramaic */
0x10860, /* Palmyrene */
0x10880, /* Nabataean */
0x108E0, /* Hatran */
0x10900, /* Phoenician */ /* 0xc0 */
0x108E0, /* Hatran */ /* 0xc0 */
0x10900, /* Phoenician */
0x10920, /* Lydian */
0x10980, /* Meroitic Hieroglyphs */
0x109A0, /* Meroitic Cursive */
Expand All @@ -301,8 +302,9 @@ static long ucs_range[]={
0x10C00, /* Old Turkic */
0x10C80, /* Old Hungarian */
0x10D00, /* Hanifi Rohingya */
0x10D40, /* Garay */ /* 0xd0 */
0x10E60, /* Rumi Numeral Symbols */
0x10E80, /* Yezidi */ /* 0xd0 */
0x10E80, /* Yezidi */
0x10EC0, /* Arabic Extended-C */
0x10F00, /* Old Sogdian */
0x10F30, /* Sogdian */
Expand All @@ -316,26 +318,29 @@ static long ucs_range[]={
0x11150, /* Mahajani */
0x11180, /* Sharada */
0x111E0, /* Sinhala Archaic Numbers */
0x11200, /* Khojki */
0x11200, /* Khojki */ /* 0xe0 */
0x11280, /* Multani */
0x112B0, /* Khudawadi */ /* 0xe0 */
0x112B0, /* Khudawadi */
0x11300, /* Grantha */
0x11380, /* Tulu-Tigalaria */
0x11400, /* Newa */
0x11480, /* Tirhuta */
0x11580, /* Siddham */
0x11600, /* Modi */
0x11660, /* Mongolian Supplement */
0x11680, /* Takri */
0x116D0, /* Myanmar Extended-C */
0x11700, /* Ahom */
0x11800, /* Dogra */
0x118A0, /* Warang Citi */
0x11900, /* Dives Akuru */
0x119A0, /* Nandinagari */
0x119A0, /* Nandinagari */ /* 0xf0 */
0x11A00, /* Zanabazar Square */
0x11A50, /* Soyombo */
0x11AB0, /* Unified Canadian Aboriginal Syllabics Extended-A */
0x11AC0, /* Pau Cin Hau */ /* 0xf0 */
0x11AC0, /* Pau Cin Hau */
0x11B00, /* Devanagari Extended-A */
0x11BC0, /* Sunuwar */
0x11C00, /* Bhaiksuki */
0x11C70, /* Marchen */
0x11D00, /* Masaram Gondi */
Expand All @@ -345,35 +350,39 @@ static long ucs_range[]={
0x11FB0, /* Lisu Supplement */
0x11FC0, /* Tamil Supplement */
0x12000, /* Cuneiform */
0x12400, /* Cuneiform Numbers and Punctuation */
0x12400, /* Cuneiform Numbers and Punctuation */ /* 0x100 */
0x12480, /* Early Dynastic Cuneiform */
0x12F90, /* Cypro-Minoan */
0x13000, /* Egyptian Hieroglyphs */
0x13430, /* Egyptian Hieroglyph Format Controls */
0x14400, /* Anatolian Hieroglyphs */ /* 0x100 */
0x13460, /* Egyptian Hieroglyphs Extended-A */
0x14400, /* Anatolian Hieroglyphs */
0x16100, /* Gurung Khema */
0x16800, /* Bamum Supplement */
0x16A40, /* Mro */
0x16A70, /* Tangsa */
0x16AD0, /* Bassa Vah */
0x16B00, /* Pahawh Hmong */
0x16D40, /* Kirat Rai */
0x16E40, /* Medefaidrin */
0x16F00, /* Miao */
0x16FE0, /* Ideographic Symbols and Punctuation */
0x16FE0, /* Ideographic Symbols and Punctuation */ /* 0x110 */
0x17000, /* Tangut */
0x18800, /* Tangut Components */
0x18B00, /* Khitan Small Script */
0x18D00, /* Tangut Supplement */
0x1AFF0, /* Kana Extended-B */
0x1B000, /* Kana Supplement */
0x1B100, /* Kana Extended-A */
0x1B130, /* Small Kana Extension */ /* 0x110 */
0x1B130, /* Small Kana Extension */
0x1B170, /* Nushu */
0x1BC00, /* Duployan */
0x1BCA0, /* Shorthand Format Controls */
0x1CC00, /* Symbols for Legacy Computing Supplement */
0x1CF00, /* Znamenny Musical Notation */
0x1D000, /* Byzantine Musical Symbols */
0x1D100, /* Musical Symbols */
0x1D200, /* Ancient Greek Musical Notation */
0x1D200, /* Ancient Greek Musical Notation */ /* 0x120 */
0x1D2C0, /* Kaktovik Numerals */
0x1D2E0, /* Mayan Numerals */
0x1D300, /* Tai Xuan Jing Symbols */
Expand All @@ -382,13 +391,14 @@ static long ucs_range[]={
0x1D800, /* Sutton SignWriting */
0x1DF00, /* Latin Extended-G */
0x1E000, /* Glagolitic Supplement */
0x1E030, /* Cyrillic Extended-D */ /* 0x120 */
0x1E030, /* Cyrillic Extended-D */
0x1E100, /* Nyiakeng Puachue Hmong */
0x1E290, /* Toto */
0x1E2C0, /* Wancho */
0x1E4D0, /* Nag Mundari */
0x1E5D0, /* Ol Onal */
0x1E7E0, /* Ethiopic Extended-B */
0x1E800, /* Mende Kikakui */
0x1E800, /* Mende Kikakui */ /* 0x130 */
0x1E900, /* Adlam */
0x1EC70, /* Indic Siyaq Numbers */
0x1ED00, /* Ottoman Siyaq Numbers */
Expand All @@ -398,13 +408,13 @@ static long ucs_range[]={
0x1F0A0, /* Playing Cards */
0x1F100, /* Enclosed Alphanumeric Supplement */
0x1F200, /* Enclosed Ideographic Supplement */
0x1F300, /* Miscellaneous Symbols and Pictographs */ /* 0x130 */
0x1F300, /* Miscellaneous Symbols and Pictographs */
0x1F600, /* Emoticons */
0x1F650, /* Ornamental Dingbats */
0x1F680, /* Transport and Map Symbols */
0x1F700, /* Alchemical Symbols */
0x1F780, /* Geometric Shapes Extended */
0x1F800, /* Supplemental Arrows-C */
0x1F800, /* Supplemental Arrows-C */ /* 0x140 */
0x1F900, /* Supplemental Symbols and Pictographs */
0x1FA00, /* Chess Symbols */
0x1FA70, /* Symbols and Pictographs Extended-A */
Expand All @@ -414,12 +424,13 @@ static long ucs_range[]={
0x2B740, /* CJK Unified Ideographs Extension D */
0x2B820, /* CJK Unified Ideographs Extension E */
0x2CEB0, /* CJK Unified Ideographs Extension F */
0x2EBF0, /* CJK Unified Ideographs Extension I */ /* 0x140 */
0x2EBF0, /* CJK Unified Ideographs Extension I */
0x2F800, /* CJK Compatibility Ideographs Supplement */
0x30000, /* CJK Unified Ideographs Extension G */
0x31350, /* CJK Unified Ideographs Extension H */
0x323B0, /* reserved */
0x40000, /* reserved */
0x323B0, /* CJK Unified Ideographs Extension J (Unicode 17.0) */
0x33480, /* reserved */
0x40000, /* reserved */ /* 0x150 */
0x50000, /* reserved */
0x60000, /* reserved */
0x70000, /* reserved */
Expand All @@ -430,14 +441,14 @@ static long ucs_range[]={
0xC0000, /* reserved */
0xD0000, /* reserved */
0xE0000, /* Tags */
0xE0100, /* Variation Selectors Supplement */ /* 0x150 */
0xE0100, /* Variation Selectors Supplement */
0xE01F0, /* reserved */
0xF0000, /* Supplementary Private Use Area-A */
0x100000, /* Supplementary Private Use Area-B */
/* Value over 0x10FFFF is illegal under Unicode,
They are for some special use. *** experimental *** */
We use them for some special applications. */
0x110000, /* Reserved */
0x120000, /* Reserved */
0x120000, /* Reserved */ /* 0x160 */
0x130000, /* Reserved */
0x140000, /* Reserved */
0x150000, /* Reserved */
Expand All @@ -448,20 +459,20 @@ static long ucs_range[]={
0x1A0000, /* Reserved */
0x1B0000, /* Reserved */
0x1C0000, /* Reserved */
0x1D0000, /* Reserved */ /* 0x160 */
0x1D0000, /* Reserved */
0x1E0000, /* Reserved */
0x1F0000, /* Reserved */
0x200000, /* Reserved */
0x210000, /* Reserved */
0x220000, /* Kana with Voiced Sound Mark */
0x220000, /* Kana with Voiced Sound Mark */ /* 0x170 */
0x240000, /* Kana with Semi-Voiced Sound Mark */
0x25E6E6, /* Emoji Flag Sequence */
0x260000, /* Emoji with Modifier Fitzpatrick */
0x300000, /* Reserved */
0x400000, /* Standardized Variation Sequence */
0x800000, /* Emoji Keycap Sequence */
0x800080, /* Ideographic Variation Sequence */ /* 0x16C */
CJK_CHAR_LIMIT, /* Ideographic Variation Sequence, VS49..VS256 */
0x800080, /* Ideographic Variation Sequence, VS17..VS48 */
CJK_CHAR_LIMIT, /* Ideographic Variation Sequence, VS49..VS256 */ /* 0x178 */
IVS_CHAR_LIMIT
};

Expand Down Expand Up @@ -544,11 +555,11 @@ integer kcatcodekey(integer c)
if ( COMBINING_ENCLOSING_KEYCAP==c )
return 0x1FA;
break;
case 0x12e: /* Block : Enclosed Alphanumeric Supplement */
case 0x138: /* Block : Enclosed Alphanumeric Supplement */
if ( REGIONAL_INDICATOR_SYMBOL_LETTER_A <=c && c<= REGIONAL_INDICATOR_SYMBOL_LETTER_Z )
return 0x1FB;
break;
case 0x130: /* Block : Miscellaneous Symbols and Pictographs */
case 0x13a: /* Block : Miscellaneous Symbols and Pictographs */
if ( EMOJI_MODIFIER_FITZPATRIC_TYPE1_2 <=c && c<= EMOJI_MODIFIER_FITZPATRIC_TYPE6 )
return 0x1FC;
break;
Expand Down
7 changes: 6 additions & 1 deletion texk/web2c/uptexdir/tests/unibib.bbl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
\begin{thebibliography}{1}
\begin{thebibliography}{10}

\bibitem{björk2020}
Tomas Björk.
Expand All @@ -22,6 +22,11 @@ Dustin Boswell and Trevor Foucher.
\newblock 구운몽.
\newblock 高麗書林, 1975.

\bibitem{imahashi}
𫝆𫞎姬, 𡈽𪧦, 𠮷田真.
\newblock CJK統合漢字拡張🄑--🄓のテスト --- その🄰.
\newblock 冬𠘨𫝷, 2024.

\bibitem{nikos}
Νίκος Καζαντζάκης.
\newblock {\em Συμπόσιον}.
Expand Down
22 changes: 22 additions & 0 deletions texk/web2c/uptexdir/tests/unibib.bib
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,25 @@ @book{kigou
publisher = "Snowman commedian Press",
year = 2020,
}

%% CJK統合漢字拡張B-Dのテスト
@book{imahashi,
author = "𫝆𫞎 姬 and 𡈽 𪧦 and 𠮷田 真",
yomi = "Hime Imahashi",
title = "CJK統合漢字拡張🄑--🄓のテスト --- その🄰",
publisher = "冬𠘨𫝷",
year = 2024,
}
% 𫝆 U+2B746 CJK Unified Ideographs Extension D AJ:13780
% 𫞎 U+2B78E CJK Unified Ideographs Extension D AJ:13724
% 姬 U+2F862 CJK Compatibility Ideographs Supplement AJ:13998
% 𡈽 U+2123D CJK Unified Ideographs Extension B AJ:13953 J3-2F42
% 𪧦 U+2A9E6 CJK Unified Ideographs Extension C AJ:14145
% 𠮷 U+20BB7 CJK Unified Ideographs Extension B AJ:13706
% 真 U+2F947 CJK Compatibility Ideographs Supplement AJ:13854
% 冬 U+2F81A CJK Compatibility Ideographs Supplement AJ:13954
% 𠘨 U+20628 CJK Unified Ideographs Extension B AJ:14105 J4-2332
% 𫝷 U+2B777 CJK Unified Ideographs Extension D AJ:13782
% 🄑 U+1F111 Parenthesized Latin Capital Letter B
% 🄓 U+1F113 Parenthesized Latin Capital Letter D
% 🄰 U+1F130 Squared Latin Capital Letter A
9 changes: 9 additions & 0 deletions texk/web2c/uptexdir/tests/unibib.tex
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@
publisher = "Snowman commedian Press",
year = 2020,
}

%% CJK統合漢字拡張B-Dのテスト
@book{imahashi,
author = "𫝆𫞎 姬 and 𡈽 𪧦 and 𠮷田 真",
yomi = "Hime Imahashi",
title = "CJK統合漢字拡張🄑--🄓のテスト --- その🄰",
publisher = "冬𠘨𫝷",
year = 2024,
}
\end{filecontents}

\documentclass{ltjsarticle}
Expand Down
6 changes: 3 additions & 3 deletions texk/web2c/uptexdir/upbibtex.ch
Original file line number Diff line number Diff line change
Expand Up @@ -968,7 +968,7 @@ function is_char_kanji_upbibtex(@!c:integer):boolean;
label exit;
var k:integer;
begin
{ based on upTeX-1.30 kcatcode status: 16,17,19->true / 15,18->false }
{ based on upTeX-1.35 kcatcode status: 16,17,19->true / 15,18->false }
is_char_kanji_upbibtex := true;
if (is_internalUPTEX) then begin { should be in sync with |kcat_code| of uptex-m.ch }
k := kcatcodekey(c);
Expand All @@ -985,8 +985,8 @@ begin
else if k=@"93 then return { Hangul Syllables }
else if k=@"94 then return { Hangul Jamo Extended-B }
else if k=@"99 then return { CJK Compatibility Ideographs }
else if (k>=@"10D)and(k<=@"110) then return { Kana Extended-B .. Small Kana Extension }
else if (k>=@"13B)and(k<=@"143) then return { CJK Unified Ideographs Extension B .. H }
else if (k>=@"115)and(k<=@"118) then return { Kana Extended-B .. Small Kana Extension }
else if (k>=@"145)and(k<=@"14F) then return { CJK Unified Ideographs Extension B .. J }
else if k=@"1FE then return { Fullwidth digit and latin alphabet }
else if k=@"1FF then return; { Halfwidth katakana }
end
Expand Down
12 changes: 6 additions & 6 deletions texk/web2c/uptexdir/uptex-m.ch
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,12 @@ if (isinternalUPTEX) then begin
@t\hskip10pt@>kcat_code(@"99):=kanji; { CJK Compatibility Ideographs }
@t\hskip10pt@>kcat_code(@"9C):=modifier; { Variation Selectors }
{ \hskip10pt|kcat_code(@"A2):=other_kchar;| Halfwidth and Fullwidth Forms }
@+@t\1@>for k:=@"10D to @"110 do kcat_code(k):=kana; { Kana Extended-B .. Small Kana Extension }
@+@t\1@>for k:=@"13B to @"143 do kcat_code(k):=kanji; { CJK Unified Ideographs Extension B .. H }
@t\hskip10pt@>kcat_code(@"150):=modifier; { Variation Selectors Supplement }
@+@t\1@>for k:=@"165 to @"166 do kcat_code(k):=kana; { Kana with (Semi-)Voiced Sound Mark }
@t\hskip10pt@>kcat_code(@"16A):=kanji; { Standardized Variation Sequence }
@+@t\1@>for k:=@"16C to @"16D do kcat_code(k):=kanji; { Ideographic Variation Sequence }
@+@t\1@>for k:=@"115 to @"118 do kcat_code(k):=kana; { Kana Extended-B .. Small Kana Extension }
@+@t\1@>for k:=@"145 to @"14F do kcat_code(k):=kanji; { CJK Unified Ideographs Extension B .. J }
@t\hskip10pt@>kcat_code(@"15B):=modifier; { Variation Selectors Supplement }
@+@t\1@>for k:=@"170 to @"171 do kcat_code(k):=kana; { Kana with (Semi-)Voiced Sound Mark }
@t\hskip10pt@>kcat_code(@"175):=kanji; { Standardized Variation Sequence }
@+@t\1@>for k:=@"177 to @"178 do kcat_code(k):=kanji; { Ideographic Variation Sequence }
@+@t\1@>for k:=@"1F9 to @"1FC do kcat_code(k):=modifier;
{ Combining Katakana-Hiragana (Semi-)Voiced Sound Mark .. Emoji Modifier Fitzpatrick }
@t\hskip10pt@>kcat_code(@"1FD):=not_cjk; { Latin-1 Letters }
Expand Down

0 comments on commit bf5e678

Please sign in to comment.