diff --git a/.gitignore b/.gitignore index fa75c0e..5753201 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,5 @@ traingenerator jeu/* build/ install/ +.ninja_* +build.ninja diff --git a/.gitmodules b/.gitmodules index e4b6e25..38c2d14 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "xpdf-4.03"] - path = xpdf-4.03 - url = git@github.com:kermitt2/xpdf-4.03.git +[submodule "xpdf-4.05"] + path = xpdf-4.05 + url = https://github.com/lfoppiano/xpdf-4.05.git \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 13f08a5..0000000 --- a/.travis.yml +++ /dev/null @@ -1,37 +0,0 @@ -matrix: - include: - - os: linux - - os: osx - osx_image: xcode9.4 - -dist: trusty -sudo: true -language: - - cpp -compiler: - - clang - -before_install: - - eval "${MATRIX_EVAL}" - - git submodule update --init --recursive; - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then - sudo add-apt-repository ppa:beineri/opt-qt571-trusty -y; - sudo apt-get update -qq; - sudo apt-get install -qq cmake build-essential; - fi - - if [ "$TRAVIS_OS_NAME" = "osx" ]; then - brew update; - brew upgrade cmake; - fi - -script: - - gcc -v && g++ -v && cmake --version - - cmake ./; - - make; - -notifications: - email: true - -git: - submodules: true - \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..34da903 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,65 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). + +## [0.5] -TBD +- update to xpdf-4.05 + +## [0.4] + +- support for xpdf language support package for language-specific fonts like Arabic, Chinese-simplified, Japanese, etc. 
+they are pre-installed locally and portable + +- refined line number detection and fixing a bug which could result in random missing numbers in the ALTO output + +- update to xpdf-4.03 + +- fix issue with character spacing due to invalid rotation condition + +- update dependencies and dependency install script + +## [0.3] + + +- line number detection: line numbers (typically added for review in manuscripts/preprints) are specifically identified and no longer mixed with the rest of the text content; they are grouped in a separate block or, optionally, not output in the ALTO file (`noLineNumbers` option) + +- removal of `-blocks` option, the block information is always returned for ensuring ALTO validation (`<TextBlock>` element) + +- bug fixing on reading order + +- fix possible incorrect XMax and YMax values at 0 on block coordinates having only one line + +## [0.2] + + +- support Unicode composition of characters + +- generalize reading order to all blocks (it was limited to the blocks of the first page) + +- detect subscript/superscript text font style attribute + +- use SVG as a format for vector images + +- propagate unsolved character Unicode value (free Unicode range for embedded fonts) as encoded special character in ALTO (so-called "placeholder" approach) + +- generate metadata information in a separate XML file (as ALTO schema does not support that) + +- use the latest version of xpdf, version 4.00 + +- add cmake + +- [ALTO](https://github.com/altoxml/documentation/wiki) output is replacing custom Xerox XML format + +- Note: this released version was used for Grobid release 0.5.6 + +## [0.1] + +- encode URI (using `xmlURIEscape` from libxml2) for the @href attribute content to avoid blocking XML wellformedness issues. From our experiments, this problem happens on average for 2-3 scholarly PDFs out of one thousand. 
+- output coordinate attributes for the BLOCK elements when the `-block` option is selected, +- add a parameter `-readingOrder` which re-orders the blocks following the reading order when the `-block` option is selected. By default in pdf2xml, the elements follow the PDF content stream (the so-called _raw order_). In xpdf, several text flow orders are available including the raw order and the reading order. Note that, with this modification and this new option, only the blocks are re-ordered. + From our experiments, the raw order can diverge quite significantly from the order of elements according to the visual/reading layout in 2-4% of scholarly PDFs (e.g. title element is introduced at the end of the page element, while visually present at the top of the page), and minor changes can be present in up to 100% of PDFs for some scientific publishers (e.g. headnote introduced at the end of the page content). This additional mode can thus be quite useful for information/structure extraction applications exploiting pdfalto output. + +- use the latest version of xpdf, version 3.04. 
+ + \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index e444d33..cfc0f70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,13 @@ -cmake_minimum_required(VERSION 3.5+) +cmake_minimum_required(VERSION 3.10) project(pdfalto) set(CMAKE_CXX_STANDARD 11) set(CMAKE_EXE_LINKER_FLAGS "-no-pie") set(CMAKE_BUILD_TYPE "Release") +# Set the SDK path +set(CMAKE_OSX_SYSROOT /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk) + #--- look for fontconfig if (NOT NO_FONTCONFIG) find_library(FONTCONFIG_LIBRARY @@ -24,7 +27,7 @@ else () endif () #build xpdf -set ( XPDF_SUBDIR ${CMAKE_CURRENT_SOURCE_DIR}/xpdf-4.03) +set ( XPDF_SUBDIR ${CMAKE_CURRENT_SOURCE_DIR}/xpdf-4.05) set ( IMAGE_SUBDIR ${CMAKE_CURRENT_SOURCE_DIR}/libs/image) diff --git a/Readme.md b/Readme.md index 8f12dcd..49e95f1 100644 --- a/Readme.md +++ b/Readme.md @@ -115,62 +115,7 @@ languages pdfalto xpdfrc # Changes -New in version 0.4 (apart various bug fixes): - -- support for xpdf language support package for language-specific fonts like Arabic, Chinese-simplified, Japanese, etc. 
they are pre-installed locally and portable - -- refined line number detection and fixing a bug which could result in random missing numbers in the ALTO output - -- update to xpdf-4.03 - -- fix issue with character spacing due to invalid rotation condition - -- update dependencies and dependency install script - -New in version 0.3 (apart various bug fixes): - -- line number detection: line numbers (typically added for review in manuscripts/preprints) are specifically identified and not anymore mixed with the rest of text content, they will be grouped in a separate block or, optionally, not outputted in the ALTO file (`noLineNumbers` option) - -- removal of `-blocks` option, the block information are always returned for ensuring ALTO validation (`` element) - -- bug fixing on reading order - -- fix possible incorrect XMax and YMax values at 0 on block coordinates having only one line - -New in version 0.2 (apart various bug fixes): - -- support Unicode composition of characters - -- generalize reading order to all blocks (it was limited to the blocks of the first page) - -- detect subscript/superscript text font style attribute - -- use SVG as a format for vectorial images - -- propagate unsolved character Unicode value (free Unicode range for embedded fonts) as encoded special character in ALTO (so-called "placeholder" approach) - -- generate metadata information in a separate XML file (as ALTO schema does not support that) - -- use the latest version of xpdf, version 4.00 - -- add cmake - -- [ALTO](https://github.com/altoxml/documentation/wiki) output is replacing custom Xerox XML format - -- Note: this released version was used for Grobid release 0.5.6 - -New in version 0.1 (apart various bug fixes): - -- encode URI (using `xmlURIEscape` from libxml2) for the @href attribute content to avoid blocking XML wellformedness issues. From our experiments, this problem happens in average for 2-3 scholar PDF out of one thousand. 
- -- output coordinates attributes for the BLOCK elements when the `-block` option is selected, - -- add a parameter `-readingOrder` which re-order the blocks following the reading order when the -block option is selected. By default in pdf2xml, the elements followed the PDF content stream (the so-called _raw order_). In xpdf, several text flow orders are available including the raw order and the reading order. Note that, with this modification and this new option, only the blocks are re-ordered. - - From our experiments, the raw order can diverge quite significantly from the order of elements according to the visual/reading layout in 2-4% of scholar PDF (e.g. title element is introduced at the end of the page element, while visually present at the top of the page), and minor changes can be present in up to 100% of PDF for some scientific publishers (e.g. headnote introduced at the end of the page content). This additional mode can be thus quite useful for information/structure extraction applications exploiting pdfalto output. - -- use the latest version of xpdf, version 3.04. 
- +All changes are in the [CHANGELOG.md](CHANGELOG.md) # Contributors diff --git a/languages/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode b/languages/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode index bed3f12..34d191c 100644 --- a/languages/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode +++ b/languages/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode @@ -812,100 +812,100 @@ fe34 2549 254a 254b -ff01 -ff02 -ff03 -ffe5 -ff05 -ff06 -ff07 -ff08 -ff09 -ff0a -ff0b -ff0c -ff0d -ff0e -ff0f -ff10 -ff11 -ff12 -ff13 -ff14 -ff15 -ff16 -ff17 -ff18 -ff19 -ff1a -ff1b -ff1c -ff1d -ff1e -ff1f -ff20 -ff21 -ff22 -ff23 -ff24 -ff25 -ff26 -ff27 -ff28 -ff29 -ff2a -ff2b -ff2c -ff2d -ff2e -ff2f -ff30 -ff31 -ff32 -ff33 -ff34 -ff35 -ff36 -ff37 -ff38 -ff39 -ff3a -ff3b -ff3c -ff3d -ff3e -ff3f -ff40 -ff41 -ff42 -ff43 -ff44 -ff45 -ff46 -ff47 -ff48 -ff49 -ff4a -ff4b -ff4c -ff4d -ff4e -ff4f -ff50 -ff51 -ff52 -ff53 -ff54 -ff55 -ff56 -ff57 -ff58 -ff59 -ff5a -ff5b -ff5c -ff5d -ffe3 +0021 +0022 +0023 +00a5 +0025 +0026 +0027 +0028 +0029 +002a +002b +002c +002d +002e +002f +0030 +0031 +0032 +0033 +0034 +0035 +0036 +0037 +0038 +0039 +003a +003b +003c +003d +003e +003f +0040 +0041 +0042 +0043 +0044 +0045 +0046 +0047 +0048 +0049 +004a +004b +004c +004d +004e +004f +0050 +0051 +0052 +0053 +0054 +0055 +0056 +0057 +0058 +0059 +005a +005b +005c +005d +005e +005f +0060 +0061 +0062 +0063 +0064 +0065 +0066 +0067 +0068 +0069 +006a +006b +006c +006d +006e +006f +0070 +0071 +0072 +0073 +0074 +0075 +0076 +0077 +0078 +0079 +007a +007b +007c +007d +00af 0101 00e1 01ce diff --git a/languages/xpdf-chinese-simplified/README b/languages/xpdf-chinese-simplified/README index 6c7c95b..f02df51 100644 --- a/languages/xpdf-chinese-simplified/README +++ b/languages/xpdf-chinese-simplified/README @@ -2,7 +2,7 @@ Xpdf: Chinese Simplified support package ======================================== Xpdf project: http://www.foolabs.com/xpdf/ -2020-dec-22 +2023-dec-05 If this package includes CMap files, they contain their own copyright 
notices and distribution conditions. All other files in the package diff --git a/languages/xpdf-korean/Adobe-KR.cidToUnicode b/languages/xpdf-korean/Adobe-KR.cidToUnicode index bf28e83..97ab572 100644 --- a/languages/xpdf-korean/Adobe-KR.cidToUnicode +++ b/languages/xpdf-korean/Adobe-KR.cidToUnicode @@ -11770,99 +11770,99 @@ fe43 fe44 fe47 fe48 -0021 -0022 -0023 -0024 -0025 -0026 -0027 -0028 -0029 -002a -002b -002c -00ad -002e -002f -0030 -0031 -0032 -0033 -0034 -0035 -0036 -0037 -0038 -0039 -003a -003b -003c -003d -003e -003f -0040 -0041 -0042 -0043 -0044 -0045 -0046 -0047 -0048 -0049 -004a -004b -004c -004d -004e -004f -0050 -0051 -0052 -0053 -0054 -0055 -0056 -0057 -0058 -0059 -005a -005b -005c -005d -005e -005f -0060 -0061 -0062 -0063 -0064 -0065 -0066 -0067 -0068 -0069 -006a -006b -006c -006d -006e -006f -0070 -0071 -0072 -0073 -0074 -0075 -0076 -0077 -0078 -0079 -007a -007b -007c -007d +ff01 +ff02 +ff03 +ff04 +ff05 +ff06 +ff07 +ff08 +ff09 +ff0a +ff0b +ff0c +ff0d +ff0e +ff0f +ff10 +ff11 +ff12 +ff13 +ff14 +ff15 +ff16 +ff17 +ff18 +ff19 +ff1a +ff1b +ff1c +ff1d +ff1e +ff1f +ff20 +ff21 +ff22 +ff23 +ff24 +ff25 +ff26 +ff27 +ff28 +ff29 +ff2a +ff2b +ff2c +ff2d +ff2e +ff2f +ff30 +ff31 +ff32 +ff33 +ff34 +ff35 +ff36 +ff37 +ff38 +ff39 +ff3a +ff3b +ff3c +ff3d +ff3e +ff3f +ff40 +ff41 +ff42 +ff43 +ff44 +ff45 +ff46 +ff47 +ff48 +ff49 +ff4a +ff4b +ff4c +ff4d +ff4e +ff4f +ff50 +ff51 +ff52 +ff53 +ff54 +ff55 +ff56 +ff57 +ff58 +ff59 +ff5a +ff5b +ff5c +ff5d ffe3 ffe6 0000 diff --git a/languages/xpdf-korean/README b/languages/xpdf-korean/README index 37fdc9e..05ab13d 100644 --- a/languages/xpdf-korean/README +++ b/languages/xpdf-korean/README @@ -2,7 +2,7 @@ Xpdf: Korean support package ============================ Xpdf project: http://www.foolabs.com/xpdf/ -2020-dec-22 +2023-dec-05 If this package includes CMap files, they contain their own copyright notices and distribution conditions. 
All other files in the package diff --git a/languages/xpdf-others/fitted.nameToUnicode b/languages/xpdf-others/fitted.nameToUnicode new file mode 100644 index 0000000..9aba076 --- /dev/null +++ b/languages/xpdf-others/fitted.nameToUnicode @@ -0,0 +1,7 @@ +0030 zero.fitted +0031 one.fitted +0032 two.fitted +0033 three.fitted +0034 four.fitted +0035 five.fitted +0036 six.fitted \ No newline at end of file diff --git a/languages/xpdf-others/ligatures.nameToUnicode b/languages/xpdf-others/ligatures.nameToUnicode new file mode 100644 index 0000000..baba5d0 --- /dev/null +++ b/languages/xpdf-others/ligatures.nameToUnicode @@ -0,0 +1,4 @@ +fb00 f_f +fb01 f_i +fb02 f_l +fb03 f_f_i diff --git a/languages/xpdf-others/oldstyle.nameToUnicode b/languages/xpdf-others/oldstyle.nameToUnicode new file mode 100644 index 0000000..c5905e6 --- /dev/null +++ b/languages/xpdf-others/oldstyle.nameToUnicode @@ -0,0 +1,20 @@ +0030 zero.oldstyle +0031 one.oldstyle +0032 two.oldstyle +0033 three.oldstyle +0034 four.oldstyle +0035 five.oldstyle +0036 six.oldstyle +0037 seven.oldstyle +0038 eight.oldstyle +0039 nine.oldstyle +f730 zero.oldstyle +f731 one.oldstyle +f732 two.oldstyle +f733 three.oldstyle +f734 four.oldstyle +f735 five.oldstyle +f736 six.oldstyle +f737 seven.oldstyle +f738 eight.oldstyle +f739 nine.oldstyle \ No newline at end of file diff --git a/languages/xpdf-others/symbols.unicodeRemapping b/languages/xpdf-others/symbols.unicodeRemapping new file mode 100644 index 0000000..46c6635 --- /dev/null +++ b/languages/xpdf-others/symbols.unicodeRemapping @@ -0,0 +1,4 @@ +f057 2126 +f057 03a9 +f02d 208B +f0a3 2264 \ No newline at end of file diff --git a/languages/xpdf-others/taboldstyle.nameToUnicode b/languages/xpdf-others/taboldstyle.nameToUnicode new file mode 100644 index 0000000..95fbb2b --- /dev/null +++ b/languages/xpdf-others/taboldstyle.nameToUnicode @@ -0,0 +1 @@ +0038 eight.taboldstyle \ No newline at end of file diff --git a/languages/xpdfrc b/languages/xpdfrc deleted file 
mode 100644 index 895c5de..0000000 --- a/languages/xpdfrc +++ /dev/null @@ -1,61 +0,0 @@ -#----- begin Arabic support package (2011-aug-15) -unicodeMap ISO-8859-6 languages/xpdf-arabic/ISO-8859-6.unicodeMap -#----- end Arabic support package -#----- begin Chinese Simplified support package (2011-sep-02) -cidToUnicode Adobe-GB1 languages/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode -unicodeMap ISO-2022-CN languages/xpdf-chinese-simplified/ISO-2022-CN.unicodeMap -unicodeMap EUC-CN languages/xpdf-chinese-simplified/EUC-CN.unicodeMap -unicodeMap GBK languages/xpdf-chinese-simplified/GBK.unicodeMap -cMapDir Adobe-GB1 languages/xpdf-chinese-simplified/CMap -toUnicodeDir languages/xpdf-chinese-simplified/CMap -#fontFileCC Adobe-GB1 /usr/..../NotoSansCJKsc-Regular.otf -#----- end Chinese Simplified support package -#----- begin Chinese Traditional support package (2011-sep-02) -cidToUnicode Adobe-CNS1 languages/xpdf-chinese-traditional/Adobe-CNS1.cidToUnicode -unicodeMap Big5 languages/xpdf-chinese-traditional/Big5.unicodeMap -unicodeMap Big5ascii languages/xpdf-chinese-traditional/Big5ascii.unicodeMap -cMapDir Adobe-CNS1 languages/xpdf-chinese-traditional/CMap -toUnicodeDir languages/xpdf-chinese-traditional/CMap -#fontFileCC Adobe-CNS1 /usr/..../NotoSansCJKtc-Regular.otf" -#----- end Chinese Traditional support package -#----- begin Cyrillic support package (2011-aug-15) -nameToUnicode languages/xpdf-cyrillic/Bulgarian.nameToUnicode -unicodeMap KOI8-R languages/xpdf-cyrillic/KOI8-R.unicodeMap -#----- end Cyrillic support package -#----- begin Greek support package (2011-aug-15) -nameToUnicode languages/xpdf-greek/Greek.nameToUnicode -unicodeMap ISO-8859-7 languages/xpdf-greek/ISO-8859-7.unicodeMap -#----- end Greek support package -#----- begin Hebrew support package (2011-aug-15) -unicodeMap ISO-8859-8 languages/xpdf-hebrew/ISO-8859-8.unicodeMap -unicodeMap Windows-1255 languages/xpdf-hebrew/Windows-1255.unicodeMap -#----- end Hebrew support package -#----- begin 
Japanese support package (2011-sep-02) -cidToUnicode Adobe-Japan1 languages/xpdf-japanese/Adobe-Japan1.cidToUnicode -unicodeMap ISO-2022-JP languages/xpdf-japanese/ISO-2022-JP.unicodeMap -unicodeMap EUC-JP languages/xpdf-japanese/EUC-JP.unicodeMap -unicodeMap Shift-JIS languages/xpdf-japanese/Shift-JIS.unicodeMap -cMapDir Adobe-Japan1 languages/xpdf-japanese/CMap -toUnicodeDir languages/xpdf-japanese/CMap -#fontFileCC Adobe-Japan1 /usr/..../NotoSansCJKjp-Regular.otf -#----- end Japanese support package -#----- begin Korean support package (2011-sep-02) -cidToUnicode Adobe-Korea1 languages/xpdf-korean/Adobe-Korea1.cidToUnicode -cidToUnicode Adobe-KR languages/xpdf-korean/Adobe-KR.cidToUnicode -unicodeMap ISO-2022-KR languages/xpdf-korean/ISO-2022-KR.unicodeMap -cMapDir Adobe-Korea1 languages/xpdf-korean/CMap -cMapDir Adobe-KR languages/xpdf-korean/CMap -toUnicodeDir languages/xpdf-korean/CMap -#fontFileCC Adobe-Korea1 /usr/..../NotoSansCJKkr-Regular.otf -#fontFileCC Adobe-KR /usr/..../NotoSansCJKkr-Regular.otf -#----- end Korean support package -#----- begin Latin2 support package (2011-aug-15) -unicodeMap Latin2 languages/xpdf-latin2/Latin2.unicodeMap -#----- end Latin2 support package -#----- begin Thai support package (2011-aug-15) -nameToUnicode languages/xpdf-thai/Thai.nameToUnicode -unicodeMap TIS-620 languages/xpdf-thai/TIS-620.unicodeMap -#----- end Thai support package -#----- begin Turkish support package (2011-aug-15) -unicodeMap ISO-8859-9 languages/xpdf-turkish/ISO-8859-9.unicodeMap -#----- end Turkish support package diff --git a/src/XmlAltoOutputDev.cc b/src/XmlAltoOutputDev.cc index f2f3fb9..8727156 100644 --- a/src/XmlAltoOutputDev.cc +++ b/src/XmlAltoOutputDev.cc @@ -73,6 +73,7 @@ using namespace icu; #include "Link.h" #include "Catalog.h" #include "Parameters.h" +#include "UnicodeRemapping.h" //#include "Page.h" // PNG lib @@ -1759,6 +1760,9 @@ TextPage::TextPage(GBool verboseA, Catalog *catalog, xmlNodePtr node, root = node; verbose = verboseA; 
//rawOrder = 1; + remapping = globalParams->getUnicodeRemapping(); + uBufSize = 16; + uBuf = (Unicode *)gmallocn(uBufSize, sizeof(Unicode)); // PL: to modify block order according to reading order if (parameters->getReadingOrder() == gTrue) { @@ -2649,7 +2653,7 @@ void TextPage::addCharToRawWord(GfxState *state, double x, double y, double dx, //cout << "addCharToRawWord" << endl; double x1, y1, w1, h1, dx2, dy2, base, sp, delta; GBool overlap; - int i; + int uBufLen, i; if (uLen == 0) { endWord(); @@ -2705,6 +2709,17 @@ void TextPage::addCharToRawWord(GfxState *state, double x, double y, double dx, return; } + // remap Unicode + uBufLen = 0; + for (i = 0; i < uLen; ++i) { + if (uBufSize - uBufLen < 8 && uBufSize < 20000) { + uBufSize *= 2; + uBuf = (Unicode *)greallocn(uBuf, uBufSize, sizeof(Unicode)); + } + uBufLen += remapping->map(u[i], uBuf + uBufLen, uBufSize - uBufLen); + } + u = uBuf; + uLen = uBufLen; if (!curWord) { beginWord(state, x, y); @@ -9242,7 +9257,8 @@ SplashFont *XmlAltoOutputDev::getSplashFont(GfxState *state, SplashCoord *matrix void XmlAltoOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, - Unicode *u, int uLen) { + Unicode *u, int uLen, + GBool fill, GBool stroke, GBool makePath) { GBool isNonUnicodeGlyph = gFalse; diff --git a/src/XmlAltoOutputDev.h b/src/XmlAltoOutputDev.h index e6feef6..0192143 100644 --- a/src/XmlAltoOutputDev.h +++ b/src/XmlAltoOutputDev.h @@ -67,6 +67,7 @@ class T3FontCache; struct T3FontCacheTag; class XmlAltoOutputDev; +class UnicodeRemapping; enum ModifierClass { NOT_A_MODIFIER, DIAERESIS, ACUTE_ACCENT, DOUBLE_ACUTE_ACCENT, GRAVE_ACCENT, DOUBLE_GRAVE_ACCENT, BREVE_ACCENT, INVERTED_BREVE_ACCENT, CIRCUMFLEX, TILDE, NORDIC_RING, CZECH_CARON, CEDILLA, DOT_ABOVE, HOOK, HORN, MACRON, OGONEK, @@ -1463,6 +1464,9 @@ class TextPage { /** if the page contains a column of line numbers */ bool lineNumber = false; + UnicodeRemapping *remapping; + 
Unicode *uBuf; + int uBufSize; // friend class TextBlock; // friend class TextColumn; @@ -1569,7 +1573,8 @@ class XmlAltoOutputDev: public OutputDev { virtual void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, - CharCode c, int nBytes, Unicode *u, int uLen); + CharCode c, int nBytes, Unicode *u, int uLen, + GBool fill, GBool stroke, GBool makePath); /** Save graphics state * @param state The state description */ diff --git a/src/pdfalto.cc b/src/pdfalto.cc index 0a22164..2a1d202 100644 --- a/src/pdfalto.cc +++ b/src/pdfalto.cc @@ -251,8 +251,9 @@ int main(int argc, char *argv[]) { cmd->append("-ocr "); //we avoid using heuristic mapping (not reliable) globalParams->setMapNumericCharNames(gFalse); - } else + } else { globalParams->setMapNumericCharNames(gTrue); + } if (fullFontName) { parameters->setFullFontName(gTrue); @@ -396,7 +397,7 @@ int main(int argc, char *argv[]) { dataDirName->append(NAME_DATA_DIR); removeAlreadyExistingData(dataDirName); - // Xml file to store annotations informations + // Xml file to store annotations information if (annots) { xmlDocPtr docAnnotXml; xmlNodePtr docroot; diff --git a/xpdf-4.03 b/xpdf-4.03 deleted file mode 160000 index d1d1fe2..0000000 --- a/xpdf-4.03 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d1d1fe28e267911a8e5a8b51b9ebde250a7d70e7 diff --git a/xpdf-4.05 b/xpdf-4.05 new file mode 160000 index 0000000..bcc6640 --- /dev/null +++ b/xpdf-4.05 @@ -0,0 +1 @@ +Subproject commit bcc6640ffa1211a801ba58826a88fb88159b2dc6 diff --git a/xpdfrc b/xpdfrc index 895c5de..0609810 100644 --- a/xpdfrc +++ b/xpdfrc @@ -59,3 +59,10 @@ unicodeMap TIS-620 languages/xpdf-thai/TIS-620.unicodeMap #----- begin Turkish support package (2011-aug-15) unicodeMap ISO-8859-9 languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package +#----- begin oldstyle support package (2024-dec-31) +nameToUnicode languages/xpdf-others/oldstyle.nameToUnicode +nameToUnicode 
languages/xpdf-others/taboldstyle.nameToUnicode +nameToUnicode languages/xpdf-others/ligatures.nameToUnicode +nameToUnicode languages/xpdf-others/fitted.nameToUnicode +unicodeRemapping languages/xpdf-others/symbols.unicodeRemapping +#----- end oldstyle support package