From 3a1b0ff7942965ec882aeb35fcc29a940a2c4951 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 18 Dec 2023 11:06:32 +0100 Subject: [PATCH] fixup! Do not detect clone entry as duplicated content. --- src/zimcheck/checks.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/zimcheck/checks.cpp b/src/zimcheck/checks.cpp index 6c7aa691..b9b41f23 100644 --- a/src/zimcheck/checks.cpp +++ b/src/zimcheck/checks.cpp @@ -14,6 +14,7 @@ #include #include #include +#include <optional> #include #include @@ -350,7 +351,7 @@ class ArticleChecker // All article with the same hash will be recorded in the same bucket of // this hash table. - std::map>> hash_main; + std::map> hash_main; zim::ConcurrentCache linkStatusCache; }; @@ -387,7 +388,7 @@ void ArticleChecker::check_item(const zim::Item& item) data = item.getData(); if(checks.isEnabled(TestType::REDUNDANT)) - hash_main[adler32(data)].push_back( {item.getIndex(), item.getClusterIndex(), item.getBlobIndex()} ); + hash_main[adler32(data)].push_back( item.getIndex() ); if (item.getMimetype() != "text/html") return; @@ -487,20 +488,26 @@ void ArticleChecker::detect_redundant_articles() progress.report(); auto l = it.second; while ( !l.empty() ) { - const auto [e1_idx, e1_cluster_idx, e1_blob_idx] = l.front(); + // The way we have constructed `l`, e1 MUST BE an item + const auto e1 = archive.getEntryByPath(l.front()).getItem(); l.pop_front(); - const auto e1 = archive.getEntryByPath(e1_idx); + const auto e1_cluster_idx = e1.getClusterIndex(); + const auto e1_blob_idx = e1.getBlobIndex(); if ( !l.empty() ) { - // The way we have constructed `l`, e1 MUST BEĀ an item - const std::string s1 = e1.getItem().getData(); + std::optional<std::string> s1; decltype(l) articlesDifferentFromE1; for(auto other : l) { - const auto [e2_idx, e2_cluster_idx, e2_blob_idx] = other; + // The way we have constructed `l`, e2 MUST BE an item + const auto e2 = archive.getEntryByPath(other).getItem(); + const auto 
e2_cluster_idx = e2.getClusterIndex(); + const auto e2_blob_idx = e2.getBlobIndex(); if (e1_cluster_idx == e2_cluster_idx && e1_blob_idx == e2_blob_idx) { continue; } - auto e2 = archive.getEntryByPath(e2_idx); - std::string s2 = e2.getItem().getData(); + if (!s1) { + s1 = e1.getData(); + } + std::string s2 = e2.getData(); if (s1 != s2 ) { articlesDifferentFromE1.push_back(other); continue;