From 7fcfd50b7d46d26f7385ac094868ee32dea7390d Mon Sep 17 00:00:00 2001 From: davmarksman <David@DAVID> Date: Sun, 30 Jun 2024 08:53:23 +0100 Subject: [PATCH 1/3] Introduce IBlock and ILettersBlock interfaces --- .../Export/AltoXmlTextExporter.cs | 2 +- .../Export/HOcrTextExporter.cs | 2 +- .../Export/PageXmlGeneralExporter.cs | 306 ++++++++++++++++++ .../Export/PageXmlTextExporter.cs | 2 +- .../TextBlock.cs | 10 +- .../TextLine.cs | 8 +- .../WhitespaceCoverExtractor.cs | 2 +- .../SinglePageLibreOfficeImages.cs | 26 +- .../PublicApiScannerTests.cs | 2 + src/UglyToad.PdfPig.Tests/TestPdfImage.cs | 2 +- .../Writer/PdfDocumentBuilderTests.cs | 14 +- src/UglyToad.PdfPig/Content/IBlock.cs | 36 +++ src/UglyToad.PdfPig/Content/IPdfImage.cs | 7 +- src/UglyToad.PdfPig/Content/InlineImage.cs | 6 +- src/UglyToad.PdfPig/Content/Letter.cs | 23 +- src/UglyToad.PdfPig/Content/Word.cs | 2 +- src/UglyToad.PdfPig/XObjects/XObjectImage.cs | 6 +- 17 files changed, 411 insertions(+), 45 deletions(-) create mode 100644 src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlGeneralExporter.cs create mode 100644 src/UglyToad.PdfPig/Content/IBlock.cs diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs index c259f8338..7ec83ec8a 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs @@ -219,7 +219,7 @@ private AltoDocument.AltoGraphicalElement ToAltoGraphicalElement(PdfPath pdfPath private AltoDocument.AltoIllustration ToAltoIllustration(IPdfImage pdfImage, double height) { illustrationCount++; - var rectangle = pdfImage.Bounds; + var rectangle = pdfImage.BoundingBox; return new AltoDocument.AltoIllustration { diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs index 7b0fd33f7..22a434646 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs @@ -273,7 +273,7 @@ private string GetCode(PdfPath path, double pageHeight, bool subPaths, int level private string GetCode(IPdfImage pdfImage, double pageHeight, int level) { imageCount++; - var bbox = pdfImage.Bounds; + var bbox = pdfImage.BoundingBox; return GetIndent(level) + "<span class='ocr_image' id='image_" + pageCount + "_" + imageCount + "' title='" + GetCode(bbox, pageHeight) + "' />"; } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlGeneralExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlGeneralExporter.cs new file mode 100644 index 000000000..ece80985e --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlGeneralExporter.cs @@ -0,0 +1,306 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export +{ + using Content; + using Core; + using DocumentLayoutAnalysis; + using Graphics.Colors; + using PAGE; + using System; + using System.Collections.Generic; + using System.IO; + using System.Linq; + using System.Xml; + using System.Xml.Serialization; + + /// <summary> + /// PAGE-XML 2019-07-15 (XML) exporter for general case + /// This is a rewrite of <see cref="PageXmlTextExporter"/> to be simple and handle a general case of text, image + /// and custom implementer defined blocks + /// <para>See https://github.com/PRImA-Research-Lab/PAGE-XML </para> + /// </summary> + public class PageXmlGeneralExporter + { + private readonly double scale; + private string indentChar; + private int nextId; + + /// <summary> + /// PAGE-XML 2019-07-15 (XML) exporter for general case + /// <para>See https://github.com/PRImA-Research-Lab/PAGE-XML </para> + /// </summary> + /// <param name="scale"></param> + /// <param name="indent"></param> + public PageXmlGeneralExporter(double scale = 1.0, string indent = "\t") + { + this.scale = scale; + indentChar = indent; + } + + /// <summary> + /// Get the PAGE-XML (XML) string of the pages layout using the <see cref="IBlock"></see>'s as the page layout + /// </summary> + /// <param name="page">The Page</param> + /// <param name="blocks">Blocks to be exported</param> + /// <returns></returns> + public string Get(Page page, IEnumerable<IBlock> blocks) + { + PageXmlDocument pageXmlDocument = new PageXmlDocument() + { + Metadata = new PageXmlDocument.PageXmlMetadata() + { + Created = DateTime.UtcNow, + LastChange = DateTime.UtcNow, + Creator = "PdfPig", + Comments = "", + }, + PcGtsId = "pc-" + page.GetHashCode() + }; + + var xmlPage = CreatePage(page.Height, page.Width, blocks); + + pageXmlDocument.Page = xmlPage; + + return Serialize(pageXmlDocument); + } + + private PageXmlDocument.PageXmlPage CreatePage(double pageHeight, double pageWidth, IEnumerable<IBlock> blocks) + { + var pageXmlPage = new PageXmlDocument.PageXmlPage() + { + ImageFilename = "unknown", + ImageHeight = (int)Math.Round(pageHeight * scale), + ImageWidth = (int)Math.Round(pageWidth * scale), + }; + + var regions = blocks + .Select(b => ToRegion(b, pageWidth, pageHeight)) + .Where(x => x != null).ToList(); + pageXmlPage.Items = regions.ToArray(); + + var regionsOrder = regions.Select(x => x.Id); + + var orderedRegions = GetOrderRegions(regionsOrder).ToArray(); + pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder() + { + Item = new PageXmlDocument.PageXmlOrderedGroup() + { + Items = orderedRegions, + Id = "g" + NextId() + } + }; + + return pageXmlPage; + } + + private IEnumerable<PageXmlDocument.PageXmlRegionRefIndexed> GetOrderRegions(IEnumerable<string> idOrder) + { + var index = 1; + foreach (var item in idOrder) + { + yield return new PageXmlDocument.PageXmlRegionRefIndexed() + { + RegionRef = item, + Index = index++ + }; + } + } + + private PageXmlDocument.PageXmlRegion ToRegion(IBlock block, double pageWidth, double pageHeight) + { + if (block is TextBlock textblock) + { + return ToPageXmlTextRegion(textblock, pageWidth, pageHeight); + } + + if (block is ILettersBlock blockOfLetters) + { + return ToPageXmlSimpleTextRegion(blockOfLetters.BoundingBox, blockOfLetters.Text, pageWidth, pageHeight); + } + + if (block is IPdfImage imageBlock) + { + return ToImageRegion(imageBlock.BoundingBox, pageWidth, pageHeight); + } + + // Default case + return ToPageXmlSimpleTextRegion(block.BoundingBox, block.ToString(), pageWidth, pageHeight); + } + + private PageXmlDocument.PageXmlImageRegion ToImageRegion(PdfRectangle box, double pageWidth, double pageHeight) + { + return new PageXmlDocument.PageXmlImageRegion() + { + Coords = ToCoords(box, pageWidth, pageHeight), + Id = "r" + NextId(), + }; + } + + private PageXmlDocument.PageXmlTableRegion ToTableRegion(PdfRectangle box, double pageWidth, double pageHeight) + { + return new PageXmlDocument.PageXmlTableRegion() + { + Coords = ToCoords(box, pageWidth, pageHeight), + Id = "r" + NextId(), + }; + } + + private PageXmlDocument.PageXmlCustomRegion ToCustomRegion(PdfRectangle box, string text, double pageWidth, double pageHeight) + { + if (box.TopLeft.Equals(box.BottomRight)) + { + return null; + } + + return new PageXmlDocument.PageXmlCustomRegion() + { + Coords = ToCoords(box, pageWidth, pageHeight), + Id = "r" + NextId(), + Type = text + }; + } + + private PageXmlDocument.PageXmlTextRegion ToPageXmlSimpleTextRegion(PdfRectangle box, string text, double pageWidth, double pageHeight) + { + string regionId = "r" + NextId(); + + return new PageXmlDocument.PageXmlTextRegion() + { + Coords = ToCoords(box, pageWidth, pageHeight), + Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph, + TextLines = new PageXmlDocument.PageXmlTextLine[0], + TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = text } }, + Id = regionId + }; + } + + private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double pageWidth, double pageHeight) + { + string regionId = "r" + NextId(); + + + return new PageXmlDocument.PageXmlTextRegion() + { + Coords = ToCoords(textBlock.BoundingBox, pageWidth, pageHeight), + Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph, + TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, pageWidth, pageHeight)).ToArray(), + TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } }, + Id = regionId + }; + } + + private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, double pageWidth, double pageHeight) + { + return new PageXmlDocument.PageXmlTextLine() + { + Coords = ToCoords(textLine.BoundingBox, pageWidth, pageHeight), + Production = PageXmlDocument.PageXmlProductionSimpleType.Printed, + Words = textLine.Words.Select(w => ToPageXmlWord(w, pageWidth, pageHeight)).ToArray(), + TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } }, + Id = "l" + NextId() + }; + } + + private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, double pageWidth, double pageHeight) + { + return new PageXmlDocument.PageXmlWord() + { + Coords = ToCoords(word.BoundingBox, pageWidth, pageHeight), + Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, pageWidth, pageHeight)).ToArray(), + TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = word.Text } }, + Id = "w" + NextId() + }; + } + + private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double pageWidth, double pageHeight) + { + return new PageXmlDocument.PageXmlGlyph() + { + Coords = ToCoords(letter.GlyphRectangle, pageWidth, pageHeight), + Ligature = false, + Production = PageXmlDocument.PageXmlProductionSimpleType.Printed, + TextStyle = new PageXmlDocument.PageXmlTextStyle() + { + FontSize = (float)letter.FontSize, + FontFamily = letter.FontName, + TextColourRgb = ToRgbEncoded(letter.Color), + }, + TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = letter.Value } }, + Id = "c" + NextId() + }; + } + + private string PointToString(PdfPoint point, double pageWidth, double pageHeight) + { + double x = Math.Round(point.X * scale); + double y = Math.Round((pageHeight - point.Y) * scale); + + // move away from borders + x = x > 1 ? x : 1; + y = y > 1 ? y : 1; + + x = x < pageWidth - 1 ? x : pageWidth - 1; + y = y < pageHeight - 1 ? y : pageHeight - 1; + + return x.ToString("0") + "," + y.ToString("0"); + } + + private string ToPoints(IEnumerable<PdfPoint> points, double pageWidth, double pageHeight) + { + return string.Join(" ", points.Select(p => PointToString(p, pageWidth, pageHeight))); + } + + private string ToPoints(PdfRectangle pdfRectangle, double pageWidth, double pageHeight) + { + return ToPoints( + new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, + pageWidth, pageHeight); + } + + private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, double pageWidth, double pageHeight) + { + return new PageXmlDocument.PageXmlCoords() + { + Points = ToPoints(pdfRectangle, pageWidth, pageHeight) + }; + } + + /// <summary> + /// PageXml Text colour in RGB encoded format + /// <para>(red value) + (256 x green value) + (65536 x blue value).</para> + /// </summary> + private string ToRgbEncoded(IColor color) + { + var rgb = color.ToRGBValues(); + int red = (int)Math.Round(255f * (float)rgb.r); + int green = 256 * (int)Math.Round(255f * (float)rgb.g); + int blue = 65536 * (int)Math.Round(255f * (float)rgb.b); + int sum = red + green + blue; + + // as per below, red and blue order might be inverted... var colorWin = System.Drawing.Color.FromArgb(sum); + return sum.ToString(); + } + + private string Serialize(PageXmlDocument pageXmlDocument) + { + XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument)); + var settings = new XmlWriterSettings() + { + Encoding = System.Text.Encoding.UTF8, + Indent = true, + IndentChars = indentChar, + }; + + using (var memoryStream = new MemoryStream()) + using (var xmlWriter = XmlWriter.Create(memoryStream, settings)) + { + serializer.Serialize(xmlWriter, pageXmlDocument); + return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray()); + } + } + + private int NextId() + { + return nextId++; + } + } +} diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs index 47337bba3..c1d6f44f8 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs @@ -273,7 +273,7 @@ private PageXmlDocument.PageXmlLineDrawingRegion ToPageXmlLineDrawingRegion(PdfP private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, PageXmlData data, double pageWidth, double pageHeight) { data.RegionsCount++; - var bbox = pdfImage.Bounds; + var bbox = pdfImage.BoundingBox; return new PageXmlDocument.PageXmlImageRegion() { Coords = ToCoords(bbox, pageWidth, pageHeight), diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs index eb7bf75be..80b4dfa6a 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs @@ -9,8 +9,13 @@ /// <summary> /// A block of text. /// </summary> - public class TextBlock - { + public class TextBlock: ILettersBlock + { + /// <summary> + /// The letters contained in this TextBlock + /// </summary> + public IReadOnlyList<Letter> Letters { get; } + /// <summary> /// The separator used between lines in the block. /// </summary> @@ -63,6 +68,7 @@ public TextBlock(IReadOnlyList<TextLine> lines, string separator = "\n") ReadingOrder = -1; TextLines = lines; + Letters = lines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).ToList().AsReadOnly(); if (lines.Count == 1) { diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs index 322780f3d..873c062c0 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs @@ -9,8 +9,13 @@ /// <summary> /// A line of text. /// </summary> - public class TextLine + public class TextLine : ILettersBlock { + /// <summary> + /// The letters contained in this TextLine + /// </summary> + public IReadOnlyList<Letter> Letters { get; } + /// <summary> /// The separator used between words in the line. /// </summary> @@ -56,6 +61,7 @@ public TextLine(IReadOnlyList<Word> words, string separator = " ") Separator = separator; Words = words; + Letters = words.SelectMany(w => w.Letters).ToList().AsReadOnly(); if (Words.Count == 1) { diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs index 395780ede..68be72a8e 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs @@ -51,7 +51,7 @@ public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words if (images?.Any() == true) { - bboxes.AddRange(images.Where(w => w.Bounds.Width > 0 && w.Bounds.Height > 0).Select(o => o.Bounds)); + bboxes.AddRange(images.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0).Select(o => o.BoundingBox)); } return GetWhitespaces(bboxes, diff --git a/src/UglyToad.PdfPig.Tests/Integration/SinglePageLibreOfficeImages.cs b/src/UglyToad.PdfPig.Tests/Integration/SinglePageLibreOfficeImages.cs index 22e6e5eb0..de53fe2d2 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/SinglePageLibreOfficeImages.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/SinglePageLibreOfficeImages.cs @@ -26,29 +26,29 @@ public void ImagesHaveCorrectDimensionsAndLocations() { var page = document.GetPage(1); - var images = page.GetImages().OrderBy(x => x.Bounds.Width).ToList(); + var images = page.GetImages().OrderBy(x => x.BoundingBox.Width).ToList(); var pdfPigSquare = images[0]; - Assert.Equal(148.3d, pdfPigSquare.Bounds.Width, doubleComparer); - Assert.Equal(148.3d, pdfPigSquare.Bounds.Height, doubleComparer); - Assert.Equal(60.1d, pdfPigSquare.Bounds.Left, doubleComparer); - Assert.Equal(765.8d, pdfPigSquare.Bounds.Top, doubleComparer); + Assert.Equal(148.3d, pdfPigSquare.BoundingBox.Width, doubleComparer); + Assert.Equal(148.3d, pdfPigSquare.BoundingBox.Height, doubleComparer); + Assert.Equal(60.1d, pdfPigSquare.BoundingBox.Left, doubleComparer); + Assert.Equal(765.8d, pdfPigSquare.BoundingBox.Top, doubleComparer); var pdfPigSquished = images[1]; - Assert.Equal(206.8d, pdfPigSquished.Bounds.Width, doubleComparer); - Assert.Equal(83.2d, pdfPigSquished.Bounds.Height, doubleComparer); - Assert.Equal(309.8d, pdfPigSquished.Bounds.Left, doubleComparer); - Assert.Equal(552.1d, pdfPigSquished.Bounds.Top, doubleComparer); + Assert.Equal(206.8d, pdfPigSquished.BoundingBox.Width, doubleComparer); + Assert.Equal(83.2d, pdfPigSquished.BoundingBox.Height, doubleComparer); + Assert.Equal(309.8d, pdfPigSquished.BoundingBox.Left, doubleComparer); + Assert.Equal(552.1d, pdfPigSquished.BoundingBox.Top, doubleComparer); var birthdayPigs = images[2]; - Assert.Equal(391d, birthdayPigs.Bounds.Width, doubleComparer); - Assert.Equal(267.1d, birthdayPigs.Bounds.Height, doubleComparer); - Assert.Equal(102.2d, birthdayPigs.Bounds.Left, doubleComparer); - Assert.Equal(426.3d, birthdayPigs.Bounds.Top, doubleComparer); + Assert.Equal(391d, birthdayPigs.BoundingBox.Width, doubleComparer); + Assert.Equal(267.1d, birthdayPigs.BoundingBox.Height, doubleComparer); + Assert.Equal(102.2d, birthdayPigs.BoundingBox.Left, doubleComparer); + Assert.Equal(426.3d, birthdayPigs.BoundingBox.Top, doubleComparer); } } diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index dc387e6e8..4195acd58 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -77,6 +77,8 @@ public void OnlyExposedApiIsPublic() "UglyToad.PdfPig.Content.DocumentInformation", "UglyToad.PdfPig.Content.EmbeddedFile", "UglyToad.PdfPig.Content.Hyperlink", + "UglyToad.PdfPig.Content.IBlock", + "UglyToad.PdfPig.Content.ILettersBlock", "UglyToad.PdfPig.Content.InlineImage", "UglyToad.PdfPig.Content.IPageFactory`1", "UglyToad.PdfPig.Content.IPdfImage", diff --git a/src/UglyToad.PdfPig.Tests/TestPdfImage.cs b/src/UglyToad.PdfPig.Tests/TestPdfImage.cs index aba9c7268..5878310be 100644 --- a/src/UglyToad.PdfPig.Tests/TestPdfImage.cs +++ b/src/UglyToad.PdfPig.Tests/TestPdfImage.cs @@ -9,7 +9,7 @@ public class TestPdfImage : IPdfImage { - public PdfRectangle Bounds { get; set; } + public PdfRectangle BoundingBox { get; set; } public int WidthInSamples { get; set; } diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs index 9ddd9683e..6186a44b8 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs @@ -563,8 +563,8 @@ public void CanWriteSinglePageWithJpeg() Assert.NotNull(image); - Assert.Equal(expectedBounds.BottomLeft, image.Bounds.BottomLeft); - Assert.Equal(expectedBounds.TopRight, image.Bounds.TopRight); + Assert.Equal(expectedBounds.BottomLeft, image.BoundingBox.BottomLeft); + Assert.Equal(expectedBounds.TopRight, image.BoundingBox.TopRight); Assert.Equal(imageBytes, image.RawMemory.ToArray()); } @@ -609,10 +609,10 @@ public void CanWrite2PagesSharingJpeg() Assert.Equal(2, page1Images.Count); var image1 = page1Images[0]; - Assert.Equal(expectedBounds1, image1.Bounds); + Assert.Equal(expectedBounds1, image1.BoundingBox); var image2 = page1Images[1]; - Assert.Equal(expectedBounds2, image2.Bounds); + Assert.Equal(expectedBounds2, image2.BoundingBox); var page2Doc = document.GetPage(2); @@ -620,7 +620,7 @@ public void CanWrite2PagesSharingJpeg() Assert.NotNull(image3); - Assert.Equal(expectedBounds3, image3.Bounds); + Assert.Equal(expectedBounds3, image3.BoundingBox); Assert.Equal(imageBytes, image1.RawMemory.ToArray()); Assert.Equal(imageBytes, image2.RawMemory.ToArray()); @@ -696,8 +696,8 @@ public void CanWriteSinglePageWithPng() Assert.NotNull(image); - Assert.Equal(expectedBounds.BottomLeft, image.Bounds.BottomLeft); - Assert.Equal(expectedBounds.TopRight, image.Bounds.TopRight); + Assert.Equal(expectedBounds.BottomLeft, image.BoundingBox.BottomLeft); + Assert.Equal(expectedBounds.TopRight, image.BoundingBox.TopRight); Assert.True(image.TryGetPng(out var png)); Assert.NotNull(png); diff --git a/src/UglyToad.PdfPig/Content/IBlock.cs b/src/UglyToad.PdfPig/Content/IBlock.cs new file mode 100644 index 000000000..6db5a5334 --- /dev/null +++ b/src/UglyToad.PdfPig/Content/IBlock.cs @@ -0,0 +1,36 @@ +namespace UglyToad.PdfPig.Content +{ + using UglyToad.PdfPig.Core; + + /// <summary> + /// Interface for classes with a bounding box + /// </summary> + public interface IBlock + { + /// <summary> + /// Gets the Bounding Box: The rectangle completely containing this object + /// </summary> + PdfRectangle BoundingBox { get; } + } + + /// <summary> + /// Interface for classes with a bounding box and text + /// </summary> + public interface ILettersBlock : IBlock + { + /// <summary> + /// The text of the block + /// </summary> + string Text { get; } + + /// <summary> + /// Text orientation of the block. + /// </summary> + TextOrientation TextOrientation { get; } + + /// <summary> + /// The letters contained in the Block + /// </summary> + IReadOnlyList<Letter> Letters { get; } + } +} diff --git a/src/UglyToad.PdfPig/Content/IPdfImage.cs b/src/UglyToad.PdfPig/Content/IPdfImage.cs index 544ea3e76..29056158a 100644 --- a/src/UglyToad.PdfPig/Content/IPdfImage.cs +++ b/src/UglyToad.PdfPig/Content/IPdfImage.cs @@ -12,13 +12,8 @@ /// <summary> /// An image in a PDF document, may be an <see cref="InlineImage"/> or a PostScript image XObject (<see cref="XObjectImage"/>). /// </summary> - public interface IPdfImage + public interface IPdfImage : IBlock { - /// <summary> - /// The placement rectangle of the image in PDF coordinates. - /// </summary> - PdfRectangle Bounds { get; } - /// <summary> /// The width of the image in samples. /// </summary> diff --git a/src/UglyToad.PdfPig/Content/InlineImage.cs b/src/UglyToad.PdfPig/Content/InlineImage.cs index 7c8c0f5ec..54d467722 100644 --- a/src/UglyToad.PdfPig/Content/InlineImage.cs +++ b/src/UglyToad.PdfPig/Content/InlineImage.cs @@ -19,7 +19,7 @@ public class InlineImage : IPdfImage private readonly Lazy<ReadOnlyMemory<byte>>? memoryFactory; /// <inheritdoc /> - public PdfRectangle Bounds { get; } + public PdfRectangle BoundingBox { get; } /// <inheritdoc /> public int WidthInSamples { get; } @@ -69,7 +69,7 @@ internal InlineImage(PdfRectangle bounds, int widthInSamples, int heightInSample DictionaryToken streamDictionary, ColorSpaceDetails colorSpaceDetails) { - Bounds = bounds; + BoundingBox = bounds; WidthInSamples = widthInSamples; HeightInSamples = heightInSamples; Decode = decode; @@ -124,7 +124,7 @@ public bool TryGetBytesAsMemory(out ReadOnlyMemory<byte> bytes) /// <inheritdoc /> public override string ToString() { - return $"Inline Image (w {Bounds.Width}, h {Bounds.Height})"; + return $"Inline Image (w {BoundingBox.Width}, h {BoundingBox.Height})"; } } } diff --git a/src/UglyToad.PdfPig/Content/Letter.cs b/src/UglyToad.PdfPig/Content/Letter.cs index 3ee0222f7..1156e795c 100644 --- a/src/UglyToad.PdfPig/Content/Letter.cs +++ b/src/UglyToad.PdfPig/Content/Letter.cs @@ -7,12 +7,22 @@ /// <summary> /// A glyph or combination of glyphs (characters) drawn by a PDF content stream. /// </summary> - public class Letter - { + public class Letter : ILettersBlock + { + /// <summary> + /// This letter as as List of Letters in order to implement ILettersBlock interface + /// </summary> + public IReadOnlyList<Letter> Letters => [this]; + /// <summary> /// The text for this letter or unicode character. /// </summary> - public string Value { get; } + public string Value { get; } + + /// <summary> + /// The text of the this letter or unicode character. Same as <see cref="Letter.Value"/> + /// </summary> + public string Text => Value; /// <summary> /// Text orientation of the letter. @@ -44,7 +54,12 @@ public class Letter /// For example letters with descenders, p, j, etc., will have a box extending below the <see cref="Location"/> they are placed at. /// The width of the glyph may also be more or less than the <see cref="Width"/> allocated for the character in the PDF content. /// </summary> - public PdfRectangle GlyphRectangle { get; } + public PdfRectangle GlyphRectangle { get; } + + /// <summary> + /// Gets the Bounding Box: The rectangle completely containing this object. Same as <see cref="GlyphRectangle"/> + /// </summary> + public PdfRectangle BoundingBox => GlyphRectangle; /// <summary> /// Size as defined in the PDF file. This is not equivalent to font size in points but is relative to other font sizes on the page. diff --git a/src/UglyToad.PdfPig/Content/Word.cs b/src/UglyToad.PdfPig/Content/Word.cs index 7d157fba8..7d96c8aa9 100644 --- a/src/UglyToad.PdfPig/Content/Word.cs +++ b/src/UglyToad.PdfPig/Content/Word.cs @@ -9,7 +9,7 @@ /// <summary> /// A word. /// </summary> - public class Word + public class Word : ILettersBlock { /// <summary> /// The text of the word. diff --git a/src/UglyToad.PdfPig/XObjects/XObjectImage.cs b/src/UglyToad.PdfPig/XObjects/XObjectImage.cs index 09ebf4e79..dda187e74 100644 --- a/src/UglyToad.PdfPig/XObjects/XObjectImage.cs +++ b/src/UglyToad.PdfPig/XObjects/XObjectImage.cs @@ -19,7 +19,7 @@ public class XObjectImage : IPdfImage private readonly Lazy<ReadOnlyMemory<byte>>? memoryFactory; /// <inheritdoc /> - public PdfRectangle Bounds { get; } + public PdfRectangle BoundingBox { get; } /// <inheritdoc /> public int WidthInSamples { get; } @@ -81,7 +81,7 @@ internal XObjectImage(PdfRectangle bounds, Lazy<ReadOnlyMemory<byte>>? bytes, ColorSpaceDetails? colorSpaceDetails) { - Bounds = bounds; + BoundingBox = bounds; WidthInSamples = widthInSamples; HeightInSamples = heightInSamples; BitsPerComponent = bitsPerComponent; @@ -116,7 +116,7 @@ public bool TryGetBytesAsMemory(out ReadOnlyMemory<byte> bytes) /// <inheritdoc /> public override string ToString() { - return $"XObject Image (w {Bounds.Width}, h {Bounds.Height}): {ImageDictionary}"; + return $"XObject Image (w {BoundingBox.Width}, h {BoundingBox.Height}): {ImageDictionary}"; } } } From 4712169a13e4071ceb109e391a706a9bc5aaeeec Mon Sep 17 00:00:00 2001 From: davmarksman <David@DAVID> Date: Sun, 30 Jun 2024 11:40:54 +0100 Subject: [PATCH 2/3] Changes from code review --- .../Export/PageXmlGeneralExporter.cs | 306 ------------------ .../Content/{IBlock.cs => IBoundingBox.cs} | 4 +- src/UglyToad.PdfPig/Content/IPdfImage.cs | 2 +- src/UglyToad.PdfPig/Content/Letter.cs | 12 +- 4 files changed, 4 insertions(+), 320 deletions(-) delete mode 100644 src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlGeneralExporter.cs rename src/UglyToad.PdfPig/Content/{IBlock.cs => IBoundingBox.cs} (90%) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlGeneralExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlGeneralExporter.cs deleted file mode 100644 index ece80985e..000000000 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlGeneralExporter.cs +++ /dev/null @@ -1,306 +0,0 @@ -namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export -{ - using Content; - using Core; - using DocumentLayoutAnalysis; - using Graphics.Colors; - using PAGE; - using System; - using System.Collections.Generic; - using System.IO; - using System.Linq; - using System.Xml; - using System.Xml.Serialization; - - /// <summary> - /// PAGE-XML 2019-07-15 (XML) exporter for general case - /// This is a rewrite of <see cref="PageXmlTextExporter"/> to be simple and handle a general case of text, image - /// and custom implementer defined blocks - /// <para>See https://github.com/PRImA-Research-Lab/PAGE-XML </para> - /// </summary> - public class PageXmlGeneralExporter - { - private readonly double scale; - private string indentChar; - private int nextId; - - /// <summary> - /// PAGE-XML 2019-07-15 (XML) exporter for general case - /// <para>See https://github.com/PRImA-Research-Lab/PAGE-XML </para> - /// </summary> - /// <param name="scale"></param> - /// <param name="indent"></param> - public PageXmlGeneralExporter(double scale = 1.0, string indent = "\t") - { - this.scale = scale; - indentChar = indent; - } - - /// <summary> - /// Get the PAGE-XML (XML) string of the pages layout using the <see cref="IBlock"></see>'s as the page layout - /// </summary> - /// <param name="page">The Page</param> - /// <param name="blocks">Blocks to be exported</param> - /// <returns></returns> - public string Get(Page page, IEnumerable<IBlock> blocks) - { - PageXmlDocument pageXmlDocument = new PageXmlDocument() - { - Metadata = new PageXmlDocument.PageXmlMetadata() - { - Created = DateTime.UtcNow, - LastChange = DateTime.UtcNow, - Creator = "PdfPig", - Comments = "", - }, - PcGtsId = "pc-" + page.GetHashCode() - }; - - var xmlPage = CreatePage(page.Height, page.Width, blocks); - - pageXmlDocument.Page = xmlPage; - - return Serialize(pageXmlDocument); - } - - private PageXmlDocument.PageXmlPage CreatePage(double pageHeight, double pageWidth, IEnumerable<IBlock> blocks) - { - var pageXmlPage = new PageXmlDocument.PageXmlPage() - { - ImageFilename = "unknown", - ImageHeight = (int)Math.Round(pageHeight * scale), - ImageWidth = (int)Math.Round(pageWidth * scale), - }; - - var regions = blocks - .Select(b => ToRegion(b, pageWidth, pageHeight)) - .Where(x => x != null).ToList(); - pageXmlPage.Items = regions.ToArray(); - - var regionsOrder = regions.Select(x => x.Id); - - var orderedRegions = GetOrderRegions(regionsOrder).ToArray(); - pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder() - { - Item = new PageXmlDocument.PageXmlOrderedGroup() - { - Items = orderedRegions, - Id = "g" + NextId() - } - }; - - return pageXmlPage; - } - - private IEnumerable<PageXmlDocument.PageXmlRegionRefIndexed> GetOrderRegions(IEnumerable<string> idOrder) - { - var index = 1; - foreach (var item in idOrder) - { - yield return new PageXmlDocument.PageXmlRegionRefIndexed() - { - RegionRef = item, - Index = index++ - }; - } - } - - private PageXmlDocument.PageXmlRegion ToRegion(IBlock block, double pageWidth, double pageHeight) - { - if (block is TextBlock textblock) - { - return ToPageXmlTextRegion(textblock, pageWidth, pageHeight); - } - - if (block is ILettersBlock blockOfLetters) - { - return ToPageXmlSimpleTextRegion(blockOfLetters.BoundingBox, blockOfLetters.Text, pageWidth, pageHeight); - } - - if (block is IPdfImage imageBlock) - { - return ToImageRegion(imageBlock.BoundingBox, pageWidth, pageHeight); - } - - // Default case - return ToPageXmlSimpleTextRegion(block.BoundingBox, block.ToString(), pageWidth, pageHeight); - } - - private PageXmlDocument.PageXmlImageRegion ToImageRegion(PdfRectangle box, double pageWidth, double pageHeight) - { - return new PageXmlDocument.PageXmlImageRegion() - { - Coords = ToCoords(box, pageWidth, pageHeight), - Id = "r" + NextId(), - }; - } - - private PageXmlDocument.PageXmlTableRegion ToTableRegion(PdfRectangle box, double pageWidth, double pageHeight) - { - return new PageXmlDocument.PageXmlTableRegion() - { - Coords = ToCoords(box, pageWidth, pageHeight), - Id = "r" + NextId(), - }; - } - - private PageXmlDocument.PageXmlCustomRegion ToCustomRegion(PdfRectangle box, string text, double pageWidth, double pageHeight) - { - if (box.TopLeft.Equals(box.BottomRight)) - { - return null; - } - - return new PageXmlDocument.PageXmlCustomRegion() - { - Coords = ToCoords(box, pageWidth, pageHeight), - Id = "r" + NextId(), - Type = text - }; - } - - private PageXmlDocument.PageXmlTextRegion ToPageXmlSimpleTextRegion(PdfRectangle box, string text, double pageWidth, double pageHeight) - { - string regionId = "r" + NextId(); - - return new PageXmlDocument.PageXmlTextRegion() - { - Coords = ToCoords(box, pageWidth, pageHeight), - Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph, - TextLines = new PageXmlDocument.PageXmlTextLine[0], - TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = text } }, - Id = regionId - }; - } - - private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double pageWidth, double pageHeight) - { - string regionId = "r" + NextId(); - - - return new PageXmlDocument.PageXmlTextRegion() - { - Coords = ToCoords(textBlock.BoundingBox, pageWidth, pageHeight), - Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph, - TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, pageWidth, pageHeight)).ToArray(), - TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } }, - Id = regionId - }; - } - - private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, double pageWidth, double pageHeight) - { - return new PageXmlDocument.PageXmlTextLine() - { - Coords = ToCoords(textLine.BoundingBox, pageWidth, pageHeight), - Production = PageXmlDocument.PageXmlProductionSimpleType.Printed, - Words = textLine.Words.Select(w => ToPageXmlWord(w, pageWidth, pageHeight)).ToArray(), - TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } }, - Id = "l" + NextId() - }; - } - - private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, double pageWidth, double pageHeight) - { - return new PageXmlDocument.PageXmlWord() - { - Coords = ToCoords(word.BoundingBox, pageWidth, pageHeight), - Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, pageWidth, pageHeight)).ToArray(), - TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = word.Text } }, - Id = "w" + NextId() - }; - } - - private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double pageWidth, double pageHeight) - { - return new PageXmlDocument.PageXmlGlyph() - { - Coords = ToCoords(letter.GlyphRectangle, pageWidth, pageHeight), - Ligature = false, - Production = PageXmlDocument.PageXmlProductionSimpleType.Printed, - TextStyle = new PageXmlDocument.PageXmlTextStyle() - { - FontSize = (float)letter.FontSize, - FontFamily = letter.FontName, - TextColourRgb = ToRgbEncoded(letter.Color), - }, - TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = letter.Value } }, - Id = "c" + NextId() - }; - } - - private string PointToString(PdfPoint point, double pageWidth, double pageHeight) - { - double x = Math.Round(point.X * scale); - double y = Math.Round((pageHeight - point.Y) * scale); - - // move away from borders - x = x > 1 ? x : 1; - y = y > 1 ? y : 1; - - x = x < pageWidth - 1 ? x : pageWidth - 1; - y = y < pageHeight - 1 ? y : pageHeight - 1; - - return x.ToString("0") + "," + y.ToString("0"); - } - - private string ToPoints(IEnumerable<PdfPoint> points, double pageWidth, double pageHeight) - { - return string.Join(" ", points.Select(p => PointToString(p, pageWidth, pageHeight))); - } - - private string ToPoints(PdfRectangle pdfRectangle, double pageWidth, double pageHeight) - { - return ToPoints( - new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, - pageWidth, pageHeight); - } - - private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, double pageWidth, double pageHeight) - { - return new PageXmlDocument.PageXmlCoords() - { - Points = ToPoints(pdfRectangle, pageWidth, pageHeight) - }; - } - - /// <summary> - /// PageXml Text colour in RGB encoded format - /// <para>(red value) + (256 x green value) + (65536 x blue value).</para> - /// </summary> - private string ToRgbEncoded(IColor color) - { - var rgb = color.ToRGBValues(); - int red = (int)Math.Round(255f * (float)rgb.r); - int green = 256 * (int)Math.Round(255f * (float)rgb.g); - int blue = 65536 * (int)Math.Round(255f * (float)rgb.b); - int sum = red + green + blue; - - // as per below, red and blue order might be inverted... var colorWin = System.Drawing.Color.FromArgb(sum); - return sum.ToString(); - } - - private string Serialize(PageXmlDocument pageXmlDocument) - { - XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument)); - var settings = new XmlWriterSettings() - { - Encoding = System.Text.Encoding.UTF8, - Indent = true, - IndentChars = indentChar, - }; - - using (var memoryStream = new MemoryStream()) - using (var xmlWriter = XmlWriter.Create(memoryStream, settings)) - { - serializer.Serialize(xmlWriter, pageXmlDocument); - return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray()); - } - } - - private int NextId() - { - return nextId++; - } - } -} diff --git a/src/UglyToad.PdfPig/Content/IBlock.cs b/src/UglyToad.PdfPig/Content/IBoundingBox.cs similarity index 90% rename from src/UglyToad.PdfPig/Content/IBlock.cs rename to src/UglyToad.PdfPig/Content/IBoundingBox.cs index 6db5a5334..2abc3da4a 100644 --- a/src/UglyToad.PdfPig/Content/IBlock.cs +++ b/src/UglyToad.PdfPig/Content/IBoundingBox.cs @@ -5,7 +5,7 @@ /// <summary> /// Interface for classes with a bounding box /// </summary> - public interface IBlock + public interface IBoundingBox { /// <summary> /// Gets the Bounding Box: The rectangle completely containing this object @@ -16,7 +16,7 @@ public interface IBlock /// <summary> /// Interface for classes with a bounding box and text /// </summary> - public interface ILettersBlock : IBlock + public interface ILettersBlock : IBoundingBox { /// <summary> /// The text of the block diff --git a/src/UglyToad.PdfPig/Content/IPdfImage.cs b/src/UglyToad.PdfPig/Content/IPdfImage.cs index 29056158a..6116f45bc 100644 --- a/src/UglyToad.PdfPig/Content/IPdfImage.cs +++ b/src/UglyToad.PdfPig/Content/IPdfImage.cs @@ -12,7 +12,7 @@ /// <summary> /// An image in a PDF document, may be an <see cref="InlineImage"/> or a PostScript image XObject (<see cref="XObjectImage"/>). /// </summary> - public interface IPdfImage : IBlock + public interface IPdfImage : IBoundingBox { /// <summary> /// The width of the image in samples. diff --git a/src/UglyToad.PdfPig/Content/Letter.cs b/src/UglyToad.PdfPig/Content/Letter.cs index 1156e795c..9cb32730b 100644 --- a/src/UglyToad.PdfPig/Content/Letter.cs +++ b/src/UglyToad.PdfPig/Content/Letter.cs @@ -7,22 +7,12 @@ /// <summary> /// A glyph or combination of glyphs (characters) drawn by a PDF content stream. /// </summary> - public class Letter : ILettersBlock + public class Letter : IBoundingBox { - /// <summary> - /// This letter as as List of Letters in order to implement ILettersBlock interface - /// </summary> - public IReadOnlyList<Letter> Letters => [this]; - /// <summary> /// The text for this letter or unicode character. /// </summary> public string Value { get; } - - /// <summary> - /// The text of the this letter or unicode character. Same as <see cref="Letter.Value"/> - /// </summary> - public string Text => Value; /// <summary> /// Text orientation of the letter. From a5a6a0d58f9c771699c850d203fce6ed431dbb63 Mon Sep 17 00:00:00 2001 From: davmarksman <David@DAVID> Date: Sun, 30 Jun 2024 11:41:48 +0100 Subject: [PATCH 3/3] Fix tests --- src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 4195acd58..affe13ee1 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -77,7 +77,7 @@ public void OnlyExposedApiIsPublic() "UglyToad.PdfPig.Content.DocumentInformation", "UglyToad.PdfPig.Content.EmbeddedFile", "UglyToad.PdfPig.Content.Hyperlink", - "UglyToad.PdfPig.Content.IBlock", + "UglyToad.PdfPig.Content.IBoundingBox", "UglyToad.PdfPig.Content.ILettersBlock", "UglyToad.PdfPig.Content.InlineImage", "UglyToad.PdfPig.Content.IPageFactory`1",