From e4219d94fd768b62c26664e120abeeefa57dea32 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Mon, 8 Jul 2024 13:40:04 -0400 Subject: [PATCH 1/2] Remove pretty-printing Whitespace behavior is different between Html::html and this half-working pretty printer. Now the tests match the parser output exactly. Signed-off-by: Evan Lloyd New-Schmidt --- src/html.rs | 2 - src/html/pretty.rs | 128 ----------------------------- tests/data/Q4185820-en/output.html | 101 +++-------------------- tests/data/Q748282-en/output.html | 111 ++++--------------------- tests/html.rs | 4 +- 5 files changed, 32 insertions(+), 314 deletions(-) delete mode 100644 src/html/pretty.rs diff --git a/src/html.rs b/src/html.rs index a1b6c89..d196082 100644 --- a/src/html.rs +++ b/src/html.rs @@ -23,8 +23,6 @@ use once_cell::sync::Lazy; use scraper::{ElementRef, Html, Node, Selector}; use serde::Deserialize; -mod pretty; -pub use pretty::pretty_print; use url::Url; #[derive(Debug, Deserialize)] diff --git a/src/html/pretty.rs b/src/html/pretty.rs deleted file mode 100644 index 903ea7c..0000000 --- a/src/html/pretty.rs +++ /dev/null @@ -1,128 +0,0 @@ -// Based on the implementation from `htmlq`: https://github.com/mgdm/htmlq/blob/6e31bc814332b2521f0316d0ed9bf0a1c521b6e6/src/pretty_print.rs -// Available under the MIT License. -// Copyright (c) 2019 Michael Maclean - -use std::{ - collections::HashSet, - io::{self, Write}, - str, -}; - -use html5ever::{ - serialize::{HtmlSerializer, Serialize, SerializeOpts, Serializer, TraversalScope}, - QualName, -}; - -use markup5ever::serialize::AttrRef; -use once_cell::sync::Lazy; -use scraper::Html; - -pub fn pretty_print(html: &Html) -> String { - let mut content: Vec = Vec::new(); - let mut pp = PrettyPrint { - indent: 0, - previous_was_block: false, - inner: HtmlSerializer::new( - &mut content, - SerializeOpts { - traversal_scope: TraversalScope::IncludeNode, - ..Default::default() - }, - ), - at_beginning: true, - }; - Serialize::serialize(html, &mut pp, TraversalScope::IncludeNode).unwrap(); - str::from_utf8(content.as_ref()).unwrap().to_owned() -} - -/// Elements to print on a single line instead of expanded. -static INLINE_ELEMENTS: Lazy> = Lazy::new(|| { - vec![ - "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "button", "canvas", "cite", - "code", "data", "datalist", "del", "dfn", "em", "embed", "i", "iframe", "img", "input", - "ins", "kbd", "label", "map", "mark", "meter", "noscript", "object", "output", "picture", - "progress", "q", "ruby", "s", "samp", "script", "select", "slot", "small", "span", - "strong", "sub", "sup", "svg", "template", "textarea", "time", "u", "tt", "var", "video", - "wbr", - ] - .into_iter() - .collect() -}); - -fn is_inline(name: &str) -> bool { - INLINE_ELEMENTS.contains(name) -} - -struct PrettyPrint { - indent: usize, - previous_was_block: bool, - inner: HtmlSerializer, - at_beginning: bool, -} - -impl Serializer for PrettyPrint { - fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()> - where - AttrIter: Iterator>, - { - // Make attribute order deterministic. - let mut attrs: Vec<_> = attrs.collect(); - attrs.sort(); - - let inline = is_inline(&name.local); - if (!inline || self.previous_was_block) && !self.at_beginning { - self.inner.writer.write_all(b"\n")?; - self.inner.writer.write_all(&vec![b' '; self.indent])?; - } - - self.indent += 2; - self.inner.start_elem(name, attrs.into_iter())?; - - if self.at_beginning { - self.at_beginning = false; - self.previous_was_block = !inline; - } - - Ok(()) - } - - fn end_elem(&mut self, name: QualName) -> io::Result<()> { - self.indent -= 2; - - if is_inline(&name.local) { - self.previous_was_block = false; - } else { - self.inner.writer.write_all(b"\n")?; - self.inner.writer.write_all(&vec![b' '; self.indent])?; - self.previous_was_block = true; - } - - self.inner.end_elem(name) - } - - fn write_text(&mut self, text: &str) -> io::Result<()> { - if text.trim().is_empty() { - Ok(()) - } else { - if self.previous_was_block { - self.inner.writer.write_all(b"\n")?; - self.inner.writer.write_all(&vec![b' '; self.indent])?; - } - - self.previous_was_block = false; - self.inner.write_text(text) - } - } - - fn write_comment(&mut self, text: &str) -> io::Result<()> { - self.inner.write_comment(text) - } - - fn write_doctype(&mut self, name: &str) -> io::Result<()> { - self.inner.write_doctype(name) - } - - fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> { - self.inner.write_processing_instruction(target, data) - } -} diff --git a/tests/data/Q4185820-en/output.html b/tests/data/Q4185820-en/output.html index 9f996d3..df96449 100644 --- a/tests/data/Q4185820-en/output.html +++ b/tests/data/Q4185820-en/output.html @@ -1,90 +1,13 @@ -

- - Thoor Ballylee Castle (IrishTúr Bhaile Uí Laí) is a fortified, 15th-century Anglo-Normantower house built by the septsde Burgo, or Burke, near the town of Gort in County Galway, Ireland. It is also known as Yeats' Tower because it was once owned and inhabited by the poet William Butler Yeats. -

-

- It has been described as ‘the most important public building in Ireland’ by late Nobel laureate Seamus Heaney. -

-

- History -

-

- The castle was built in the 15th (or possibly 16th) century and originally formed part of the huge estates of the Earls of Clanricarde, from the de Burgo or Burke family. -

-

- The nearby four-arched bridge dates to around 1825. In 1837, the Carrig family was recorded as living in the castle. At the time of Griffith's Valuation (1857), Patrick Carrick was leasing a herd's house, castle and land at Ballylee, barony of Kiltartan, from William Henry Gregory. At the time, the property was valued at £5. -

-

- In the early 1900s, the castle/tower was still owned by the Gregory family and became part of nearby Coole Estate, home of Lady Augusta Gregory, Yeats’ lifelong friend. On the estate, Coole House, where Lady Gregory lived, was the centre for meetings for the Irish literary group, a group composed of a great number of preeminent figures of the day. Near this tower, in Coole Park, began the Irish Literary Revival. -

-

- Thoor Ballylee is also known today as Yeats’ Tower, because in 1916 (or 1917) Yeats purchased the property for the nominal sum of £35 because he was so enchanted with it and especially as it was located in a rural area. From 1921 to 1929, Yeats and his family lived there as it was his monument and symbol: In both aspects, it satisfied his desire for a rooted place in the countryside. The tower retained its original windows in the upper part. Yeats and his architect, Professor William A. Scott, restored the tower for the next two years and installed larger windows in the lower floors. -

-

- As he had an affinity for the Irish language, Yeats dropped the term "castle" in naming the property and replaced it with "Thoor" (Túr), the Irish word for "tower"; thus, the place has been known as Thoor Ballylee. For twelve years, Thoor Ballylee was Yeats’ summer home as it was his country retreat. In a letter to a friend, he wrote, "Everything is so beautiful that to go elsewhere is to leave beauty behind." Consequently, it is no wonder that Yeats was inspired and compelled to create literary works at Ballylee such as poems like The Tower and Coole Park and Ballylee. -

-

- In 1929, Ballylee was abandoned as the Yeats family moved out and it fell to disuse and ruin. -

-

- In 1951, a scene of John Ford's The Quiet Man in which John Wayne and Maureen O'Hara cross a river was shot next to Thoor Ballylee. -

-

- Mary Hanley (1914-1979) was the founder of the Kiltartan Society. A native of Carron, County Clare, Hanley founded the society in 1961 to foster interest in the literary history of the district, especially that of Lady Gregory, Edward Martyn and W.B. Yeats. She was responsible for the restoration of Thoor Ballylee (with the aid of Bord Fáilte and the Yeats family). At the time, the Office of Public Works was owner of the property. Hanley persuaded the poet Padraic Colum to open the castle on Sunday 20 June 1965, the centenary of Yeats’s birth, as Yeats Tower to appear as it was when he lived there and refitted as a Yeats museum containing a collection of first editions and items of furniture. The adjoining miller's cottage became a tea room and shop. This was later expanded by a newly constructed building in the back. -

-

- Today -

-

- Due to its proximity to the Streamstown River, Thoor Ballylee is subject to sporadic flooding. This occurred notably in 1995 and in 2009/2010. In 2009, Thoor Ballylee was extensively damaged by flooding. For a while it appeared that due to the financial problems of the Irish government, no money would be available to repair it. -

-

- Thus only in February 2012 did work by Fáilte Ireland on restoring the tower begin, although no opening date was envisaged at the time. One of the forces behind the decision to repair the tower had been East Galway senator Lorraine Higgins, who argued that a reopened Yeats' Tower would be a boon to local tourism. -

-

- By February 2013 the tower had still not reopened. However, a private group — in cooperation with Fáilte Ireland — had engaged the services of Galway Rural Development, a make-work-scheme, for the maintenance work. -

-

- In 2014, a local community group the "Yeats Thoor Ballylee Society" leased Thoor Ballylee from Fáilte Ireland to develop it into a culture and education centre, in time for the Yeats 150th Anniversary in June 2015. The Society is cooperating with the National Yeats Steering Committee and the Yeats Society to ensure that Thoor Ballylee is an integrated part of the Yeats 2015 celebrations. -

-

- In early December 2015, Storm Desmond devastated parts of Ireland with flooding rain and damaging winds. Thoor Ballylee, and the adjacent cottage, were both damaged by several feet of flood water. -

-

- Architecture -

-

- With four floors, the tower consists of one room on each floor that is connected by a spiral stone stairway built into the seven-foot thickness of the massive outer wall. Each floor has a window that overlooks the Streamstown River that flows alongside the tower. There is a small thatch cottage attached. -

-

- Yeats described the ground-floor chamber as "the pleasantest room I have yet seen, a great wide window opening over the river and a round arched door leading to the thatched hall". He also admired the mural stair, symbolically declaring "This winding, gyring, spiring treadmill of a stair is my ancestral stair; That Goldsmith and the Dean, Berkeley and Burke have traveled there." -

-

- There is a tablet on the wall that commemorates Yeats' sojourn: -

-
-

- - I, the poet William Yeats, -

-

- - With old mill boards and sea-green slates, -

-

- - And smithy work from the Gort forge, -

-

- - Restored this tower for my wife George. -

-

- - And may these characters remain -

-

- - When all is ruin once again. -

+

Thoor Ballylee Castle (Irish Túr Bhaile Uí Laí) is a fortified, 15th-century Anglo-Norman tower house built by the septs de Burgo, or Burke, near the town of Gort in County Galway, Ireland. It is also known as Yeats' Tower because it was once owned and inhabited by the poet William Butler Yeats.

It has been described as ‘the most important public building in Ireland’ by late Nobel laureate Seamus Heaney.

History

The castle was built in the 15th (or possibly 16th) century and originally formed part of the huge estates of the Earls of Clanricarde, from the de Burgo or Burke family.

The nearby four-arched bridge dates to around 1825. In 1837, the Carrig family was recorded as living in the castle. At the time of Griffith's Valuation (1857), Patrick Carrick was leasing a herd's house, castle and land at Ballylee, barony of Kiltartan, from William Henry Gregory. At the time, the property was valued at £5.

In the early 1900s, the castle/tower was still owned by the Gregory family and became part of nearby Coole Estate, home of Lady Augusta Gregory, Yeats’ lifelong friend. On the estate, Coole House, where Lady Gregory lived, was the centre for meetings for the Irish literary group, a group composed of a great number of preeminent figures of the day. Near this tower, in Coole Park, began the Irish Literary Revival.

Thoor Ballylee is also known today as Yeats’ Tower, because in 1916 (or 1917) Yeats purchased the property for the nominal sum of £35 because he was so enchanted with it and especially as it was located in a rural area. From 1921 to 1929, Yeats and his family lived there as it was his monument and symbol: In both aspects, it satisfied his desire for a rooted place in the countryside. The tower retained its original windows in the upper part. Yeats and his architect, Professor William A. Scott, restored the tower for the next two years and installed larger windows in the lower floors.

As he had an affinity for the Irish language, Yeats dropped the term "castle" in naming the property and replaced it with "Thoor" (Túr), the Irish word for "tower"; thus, the place has been known as Thoor Ballylee. For twelve years, Thoor Ballylee was Yeats’ summer home as it was his country retreat. In a letter to a friend, he wrote, "Everything is so beautiful that to go elsewhere is to leave beauty behind." Consequently, it is no wonder that Yeats was inspired and compelled to create literary works at Ballylee such as poems like The Tower and Coole Park and Ballylee.

In 1929, Ballylee was abandoned as the Yeats family moved out and it fell to disuse and ruin.

In 1951, a scene of John Ford's The Quiet Man in which John Wayne and Maureen O'Hara cross a river was shot next to Thoor Ballylee.

Mary Hanley (1914-1979) was the founder of the Kiltartan Society. A native of Carron, County Clare, Hanley founded the society in 1961 to foster interest in the literary history of the district, especially that of Lady Gregory, Edward Martyn and W.B. Yeats. She was responsible for the restoration of Thoor Ballylee (with the aid of Bord Fáilte and the Yeats family). At the time, the Office of Public Works was owner of the property. Hanley persuaded the poet Padraic Colum to open the castle on Sunday 20 June 1965, the centenary of Yeats’s birth, as Yeats Tower to appear as it was when he lived there and refitted as a Yeats museum containing a collection of first editions and items of furniture. The adjoining miller's cottage became a tea room and shop. This was later expanded by a newly constructed building in the back.

Today

Due to its proximity to the Streamstown River, Thoor Ballylee is subject to sporadic flooding. This occurred notably in 1995 and in 2009/2010. In 2009, Thoor Ballylee was extensively damaged by flooding. For a while it appeared that due to the financial problems of the Irish government, no money would be available to repair it.

Thus only in February 2012 did work by Fáilte Ireland on restoring the tower begin, although no opening date was envisaged at the time. One of the forces behind the decision to repair the tower had been East Galway senator Lorraine Higgins, who argued that a reopened Yeats' Tower would be a boon to local tourism.

By February 2013 the tower had still not reopened. However, a private group — in cooperation with Fáilte Ireland — had engaged the services of Galway Rural Development, a make-work-scheme, for the maintenance work.

In 2014, a local community group the "Yeats Thoor Ballylee Society" leased Thoor Ballylee from Fáilte Ireland to develop it into a culture and education centre, in time for the Yeats 150th Anniversary in June 2015. The Society is cooperating with the National Yeats Steering Committee and the Yeats Society to ensure that Thoor Ballylee is an integrated part of the Yeats 2015 celebrations.

In early December 2015, Storm Desmond devastated parts of Ireland with flooding rain and damaging winds. Thoor Ballylee, and the adjacent cottage, were both damaged by several feet of flood water.

Architecture

With four floors, the tower consists of one room on each floor that is connected by a spiral stone stairway built into the seven-foot thickness of the massive outer wall. Each floor has a window that overlooks the Streamstown River that flows alongside the tower. There is a small thatch cottage attached.

Yeats described the ground-floor chamber as "the pleasantest room I have yet seen, a great wide window opening over the river and a round arched door leading to the thatched hall". He also admired the mural stair, symbolically declaring "This winding, gyring, spiring treadmill of a stair is my ancestral stair; That Goldsmith and the Dean, Berkeley and Burke have traveled there."

There is a tablet on the wall that commemorates Yeats' sojourn:

+

I, the poet William Yeats,

+ +

With old mill boards and sea-green slates,

+ +

And smithy work from the Gort forge,

+ +

Restored this tower for my wife George.

+ +

And may these characters remain

+ +

When all is ruin once again.

\ No newline at end of file diff --git a/tests/data/Q748282-en/output.html b/tests/data/Q748282-en/output.html index b525e44..b98dbba 100644 --- a/tests/data/Q748282-en/output.html +++ b/tests/data/Q748282-en/output.html @@ -1,93 +1,18 @@ -

- The Crimean Mountains (Crimean Tatar: Qırım dağları; Ukrainian: Кримські гори; Russian: Крымские горы; Turkish: Yayla Dağları) or Yayla Mountains are a range of mountains running parallel to the south-eastern coast of Crimea, between about 8–13 kilometers (5–8 miles) from the sea. Toward the west, the mountains drop steeply to the Black Sea, and to the east, they change slowly into a steppe landscape. -

-

- The Crimean Mountains consist of three subranges. The highest is the Main Range, which is subdivided into several yaylas or mountain plateaus (yayla or yaylak is Turkic for "alpine meadow"). They are: -

-
    -
  • - Baydar yayla -
  • -
  • - Ai-Petri yayla -
  • -
  • - Yalta yayla -
  • -
  • - Nikita yayla -
  • -
  • - Hurzuf yayla -
  • -
  • - Babugan yayla -
  • -
  • - Chatyr-Dag yayla -
  • -
  • - Dologorukovskaya (Subatkan) yayla -
  • -
  • - Demirci yayla -
  • -
  • - Qarabiy yayla -
  • -
-

- Highest peaks -

-

- The Crimea's highest peak is the Roman-Kosh (Ukrainian: Роман-Кош; Russian: Роман-Кош, Crimean Tatar: Roman Qoş) on the Babugan Yayla at 1,545 metres (5,069ft). Other important peaks over 1,200 metres include: -

-
    -
  • - Demir-Kapu (Ukrainian: Демір-Капу, Russian: Демир-Капу, Crimean Tatar: Demir Qapı) 1,540 m in the Babugan Yayla; -
  • -
  • - Zeytin-Kosh (Ukrainian: Зейтин-Кош; Russian: Зейтин-Кош, Crimean Tatar: Zeytün Qoş) 1,537 m in the Babugan Yayla; -
  • -
  • - Kemal-Egerek (Ukrainian: Кемаль-Егерек, Russian: Кемаль-Эгерек, Crimean Tatar: Kemal Egerek) 1,529 m in the Babugan Yayla; -
  • -
  • - Eklizi-Burun (Ukrainian: Еклізі-Бурун, Russian: Эклизи-Бурун, Crimean Tatar: Eklizi Burun) 1,527 m in the Chatyrdag Yayla; -
  • -
  • - Lapata (Ukrainian: Лапата; Russian: Лапата, Crimean Tatar: Lapata) 1,406 m in the Yaltynska Yayla, Yalta Yaylası; -
  • -
  • - Northern Demirji (Ukrainian: Північний Демірджі, Russian: Северный Демирджи, Crimean Tatar: Şimaliy Demirci) 1,356 m in the Demirci Yayla; -
  • -
  • - Ai-Petri (Ukrainian: Ай-Петрі, Russian: Ай-Петри, Crimean Tatar: Ay Petri) 1,234 m in the Ay Petri Yaylası. -
  • -
-

- Passes and rivers -

-

- The passes over the Crimean Mountains are: (from east to west) -

-
    -
  • - Angarskyi Pass (752m) near Perevalne, on a road from Alushta to Simferopol -
  • -
  • - Baydar Gate (503m) near Foros, connecting Baydar Valley and the sea coast -
  • -
  • - Laspi Pass (350m) near Cape Aya, on a road from Yalta to Sevastopol. -
  • -
-

- Rivers of the Crimean Mountains include the Alma River, Chernaya River, and Salhir River on the northern slope and Uchan-su River on the southern slope which forms the Uchan-su waterfall, and the highest waterfall in Crimea. -

-

- History -

-

- Archaeologists have found the earliest anatomically modern humans in Europe in the Crimean Mountains' Buran-Kaya caves. The fossils are 32,000 years old, with the artifacts linked to the Gravettian culture. The fossils have cut marks suggesting a post-mortem defleshing ritual. -

\ No newline at end of file +

The Crimean Mountains (Crimean Tatar: Qırım dağları; Ukrainian: Кримські гори; Russian: Крымские горы; Turkish: Yayla Dağları) or Yayla Mountains are a range of mountains running parallel to the south-eastern coast of Crimea, between about 8–13 kilometers (5–8 miles) from the sea. Toward the west, the mountains drop steeply to the Black Sea, and to the east, they change slowly into a steppe landscape.

The Crimean Mountains consist of three subranges. The highest is the Main Range, which is subdivided into several yaylas or mountain plateaus (yayla or yaylak is Turkic for "alpine meadow"). They are:

  • Baydar yayla
  • +
  • Ai-Petri yayla
  • +
  • Yalta yayla
  • +
  • Nikita yayla
  • +
  • Hurzuf yayla
  • +
  • Babugan yayla
  • +
  • Chatyr-Dag yayla
  • +
  • Dologorukovskaya (Subatkan) yayla
  • +
  • Demirci yayla
  • +
  • Qarabiy yayla

Highest peaks

The Crimea's highest peak is the Roman-Kosh (Ukrainian: Роман-Кош; Russian: Роман-Кош, Crimean Tatar: Roman Qoş) on the Babugan Yayla at 1,545 metres (5,069ft). Other important peaks over 1,200 metres include:

  • Demir-Kapu (Ukrainian: Демір-Капу, Russian: Демир-Капу, Crimean Tatar: Demir Qapı) 1,540 m in the Babugan Yayla;
  • +
  • Zeytin-Kosh (Ukrainian: Зейтин-Кош; Russian: Зейтин-Кош, Crimean Tatar: Zeytün Qoş) 1,537 m in the Babugan Yayla;
  • +
  • Kemal-Egerek (Ukrainian: Кемаль-Егерек, Russian: Кемаль-Эгерек, Crimean Tatar: Kemal Egerek) 1,529 m in the Babugan Yayla;
  • +
  • Eklizi-Burun (Ukrainian: Еклізі-Бурун, Russian: Эклизи-Бурун, Crimean Tatar: Eklizi Burun) 1,527 m in the Chatyrdag Yayla;
  • +
  • Lapata (Ukrainian: Лапата; Russian: Лапата, Crimean Tatar: Lapata) 1,406 m in the Yaltynska Yayla, Yalta Yaylası;
  • +
  • Northern Demirji (Ukrainian: Північний Демірджі, Russian: Северный Демирджи, Crimean Tatar: Şimaliy Demirci) 1,356 m in the Demirci Yayla;
  • +
  • Ai-Petri (Ukrainian: Ай-Петрі, Russian: Ай-Петри, Crimean Tatar: Ay Petri) 1,234 m in the Ay Petri Yaylası.

Passes and rivers

The passes over the Crimean Mountains are: (from east to west)

  • Angarskyi Pass (752m) near Perevalne, on a road from Alushta to Simferopol
  • +
  • Baydar Gate (503m) near Foros, connecting Baydar Valley and the sea coast
  • +
  • Laspi Pass (350m) near Cape Aya, on a road from Yalta to Sevastopol.

Rivers of the Crimean Mountains include the Alma River, Chernaya River, and Salhir River on the northern slope and Uchan-su River on the southern slope which forms the Uchan-su waterfall, and the highest waterfall in Crimea.

History

Archaeologists have found the earliest anatomically modern humans in Europe in the Crimean Mountains' Buran-Kaya caves. The fossils are 32,000 years old, with the artifacts linked to the Gravettian culture. The fossils have cut marks suggesting a post-mortem defleshing ritual.

\ No newline at end of file diff --git a/tests/html.rs b/tests/html.rs index fdef3f5..32f4581 100644 --- a/tests/html.rs +++ b/tests/html.rs @@ -3,7 +3,7 @@ //! To update the expected output, run the test again with the env variable //! `UPDATE_EXPECT=1` set. //! See https://docs.rs/expect-test/ for more information. -use om_wikiparser::html::{detect_lang, pretty_print, process, process_str, HtmlError}; +use om_wikiparser::html::{detect_lang, process, process_str, HtmlError}; use expect_test::{expect_file, ExpectFile}; use scraper::Html; @@ -12,7 +12,7 @@ fn check(input: &str, expect: ExpectFile) { let html = Html::parse_document(input); let lang = detect_lang(&html).unwrap(); let html = process(html, &lang).unwrap(); - let processed = pretty_print(&html); + let processed = html.html(); expect.assert_eq(&processed); } From 457bd548ecbc2afb722314be5a6b13949c9d4fbd Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Mon, 8 Jul 2024 14:19:16 -0400 Subject: [PATCH 2/2] Preserve whitespace of removed "empty" elements Some articles use non-breaking spaces between quantities and units, which Wikipedia seems to wrap with a span. Elements with no or whitespace-only text were previously removed to prune ``s and parents of other removed elements. This fix preserves the internal whitespace of elements that would otherwise be removed for being "empty". It does not distinguish between "meaningful" whitespace and padding between elements that would be collapsed by HTML formatting rules. It also cannot distinguish between elements that _started_ with only whitespace and nodes that now contain only whitespace after previous steps. The preserved whitespace in the latter case is unlikely to remain because of later processing steps. Fixes #47, fixes organicmaps/organicmaps#8651 Signed-off-by: Evan Lloyd New-Schmidt --- src/html.rs | 9 ++++++--- tests/data/Q748282-en/output.html | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/html.rs b/src/html.rs index d196082..20a9aa6 100644 --- a/src/html.rs +++ b/src/html.rs @@ -202,7 +202,7 @@ pub fn simplify(document: &mut Html, lang: &str) { remove_empty_sections(document); - remove_empty(document); + expand_empty(document); remove_non_element_nodes(document); @@ -305,7 +305,8 @@ fn remove_toplevel_whitespace(document: &mut Html) { remove_ids(document, to_remove.drain(..)); } -fn remove_empty(document: &mut Html) { +/// Expand elements that contain no text or only whitespace, leaving only their contents. +fn expand_empty(document: &mut Html) { let mut to_remove = Vec::new(); for el in document @@ -318,7 +319,9 @@ fn remove_empty(document: &mut Html) { } } - remove_ids(document, to_remove.drain(..)); + for id in to_remove.drain(..) { + expand_id(document, id); + } } fn remove_empty_sections(document: &mut Html) { diff --git a/tests/data/Q748282-en/output.html b/tests/data/Q748282-en/output.html index b98dbba..1ec2e31 100644 --- a/tests/data/Q748282-en/output.html +++ b/tests/data/Q748282-en/output.html @@ -7,7 +7,7 @@
  • Chatyr-Dag yayla
  • Dologorukovskaya (Subatkan) yayla
  • Demirci yayla
  • -
  • Qarabiy yayla
  • Highest peaks

    The Crimea's highest peak is the Roman-Kosh (Ukrainian: Роман-Кош; Russian: Роман-Кош, Crimean Tatar: Roman Qoş) on the Babugan Yayla at 1,545 metres (5,069ft). Other important peaks over 1,200 metres include:

    • Demir-Kapu (Ukrainian: Демір-Капу, Russian: Демир-Капу, Crimean Tatar: Demir Qapı) 1,540 m in the Babugan Yayla;
    • +
    • Qarabiy yayla

    Highest peaks

    The Crimea's highest peak is the Roman-Kosh (Ukrainian: Роман-Кош; Russian: Роман-Кош, Crimean Tatar: Roman Qoş) on the Babugan Yayla at 1,545 metres (5,069 ft). Other important peaks over 1,200 metres include:

    • Demir-Kapu (Ukrainian: Демір-Капу, Russian: Демир-Капу, Crimean Tatar: Demir Qapı) 1,540 m in the Babugan Yayla;
    • Zeytin-Kosh (Ukrainian: Зейтин-Кош; Russian: Зейтин-Кош, Crimean Tatar: Zeytün Qoş) 1,537 m in the Babugan Yayla;
    • Kemal-Egerek (Ukrainian: Кемаль-Егерек, Russian: Кемаль-Эгерек, Crimean Tatar: Kemal Egerek) 1,529 m in the Babugan Yayla;
    • Eklizi-Burun (Ukrainian: Еклізі-Бурун, Russian: Эклизи-Бурун, Crimean Tatar: Eklizi Burun) 1,527 m in the Chatyrdag Yayla;