diff --git a/languages/xpdf-others/ligatures.nameToUnicode b/languages/xpdf-others/ligatures.nameToUnicode index 831b07c..baba5d0 100644 --- a/languages/xpdf-others/ligatures.nameToUnicode +++ b/languages/xpdf-others/ligatures.nameToUnicode @@ -2,5 +2,3 @@ fb00 f_f fb01 f_i fb02 f_l fb03 f_f_i - - diff --git a/src/XmlAltoOutputDev.cc b/src/XmlAltoOutputDev.cc index fb1a7e3..8727156 100644 --- a/src/XmlAltoOutputDev.cc +++ b/src/XmlAltoOutputDev.cc @@ -73,6 +73,7 @@ using namespace icu; #include "Link.h" #include "Catalog.h" #include "Parameters.h" +#include "UnicodeRemapping.h" //#include "Page.h" // PNG lib @@ -1759,6 +1760,9 @@ TextPage::TextPage(GBool verboseA, Catalog *catalog, xmlNodePtr node, root = node; verbose = verboseA; //rawOrder = 1; + remapping = globalParams->getUnicodeRemapping(); + uBufSize = 16; + uBuf = (Unicode *)gmallocn(uBufSize, sizeof(Unicode)); // PL: to modify block order according to reading order if (parameters->getReadingOrder() == gTrue) { @@ -2649,7 +2653,7 @@ void TextPage::addCharToRawWord(GfxState *state, double x, double y, double dx, //cout << "addCharToRawWord" << endl; double x1, y1, w1, h1, dx2, dy2, base, sp, delta; GBool overlap; - int i; + int uBufLen, i; if (uLen == 0) { endWord(); @@ -2705,6 +2709,17 @@ void TextPage::addCharToRawWord(GfxState *state, double x, double y, double dx, return; } + // remap Unicode + uBufLen = 0; + for (i = 0; i < uLen; ++i) { + if (uBufSize - uBufLen < 8 && uBufSize < 20000) { + uBufSize *= 2; + uBuf = (Unicode *)greallocn(uBuf, uBufSize, sizeof(Unicode)); + } + uBufLen += remapping->map(u[i], uBuf + uBufLen, uBufSize - uBufLen); + } + u = uBuf; + uLen = uBufLen; if (!curWord) { beginWord(state, x, y); diff --git a/src/XmlAltoOutputDev.h b/src/XmlAltoOutputDev.h index e39aff5..0192143 100644 --- a/src/XmlAltoOutputDev.h +++ b/src/XmlAltoOutputDev.h @@ -67,6 +67,7 @@ class T3FontCache; struct T3FontCacheTag; class XmlAltoOutputDev; +class UnicodeRemapping; enum ModifierClass { NOT_A_MODIFIER, DIAERESIS, ACUTE_ACCENT, DOUBLE_ACUTE_ACCENT, GRAVE_ACCENT, DOUBLE_GRAVE_ACCENT, BREVE_ACCENT, INVERTED_BREVE_ACCENT, CIRCUMFLEX, TILDE, NORDIC_RING, CZECH_CARON, CEDILLA, DOT_ABOVE, HOOK, HORN, MACRON, OGONEK, @@ -1463,6 +1464,9 @@ class TextPage { /** if the page contains a column of line numbers */ bool lineNumber = false; + UnicodeRemapping *remapping; + Unicode *uBuf; + int uBufSize; // friend class TextBlock; // friend class TextColumn; diff --git a/xpdfrc b/xpdfrc index 077e658..0609810 100644 --- a/xpdfrc +++ b/xpdfrc @@ -60,10 +60,9 @@ unicodeMap TIS-620 languages/xpdf-thai/TIS-620.unicodeMap unicodeMap ISO-8859-9 languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package #----- begin oldstyle support package (2024-dec-31) -nameToUnicode languages/xpdf-others/oldstyle.nameToUnicode -nameToUnicode languages/xpdf-others/taboldstyle.nameToUnicode -nameToUnicode languages/xpdf-others/ligatures.nameToUnicode -nameToUnicode languages/xpdf-others/fitted.nameToUnicode -unicodeRemapping languages/xpdf-others/symbols.unicodeRemapping" +nameToUnicode languages/xpdf-others/oldstyle.nameToUnicode +nameToUnicode languages/xpdf-others/taboldstyle.nameToUnicode +nameToUnicode languages/xpdf-others/ligatures.nameToUnicode +nameToUnicode languages/xpdf-others/fitted.nameToUnicode +unicodeRemapping languages/xpdf-others/symbols.unicodeRemapping #----- end oldstyle support package -