From 5a223e172298eb83e8b0f6880364b4746ba760e5 Mon Sep 17 00:00:00 2001
From: tballison
Date: Mon, 28 Aug 2023 10:05:44 -0400
Subject: [PATCH 01/28] NUTCH-2989 -- ElasticIndexWriter should enable auth
 for https, too

---
 .../apache/nutch/indexwriter/elastic/ElasticIndexWriter.java | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index 053bfd68aa..290d9dfca2 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -210,6 +210,9 @@ public HttpAsyncClientBuilder customizeHttpClient(
     restClientBuilder.setHttpClientConfigCallback(new HttpClientConfigCallback() {
       @Override
       public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) {
+        if (auth) {
+          httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider);
+        }
         // ignore issues with self-signed certificates
         httpClientBuilder.setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE);
         return httpClientBuilder.setSSLContext(sslContext);
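For context, the added branch attaches the same credentials provider that the plain-http callback already uses; previously it was only wired up for http, so credentials configured for an https endpoint were silently ignored. A minimal sketch of how such a provider is typically built with the Apache HttpClient 4.x API that the Elasticsearch low-level RestClient sits on (the method and variable names here are illustrative assumptions, not the exact Nutch code):

    import org.apache.http.auth.AuthScope;
    import org.apache.http.auth.UsernamePasswordCredentials;
    import org.apache.http.client.CredentialsProvider;
    import org.apache.http.impl.client.BasicCredentialsProvider;

    public class EsAuthSketch {
      // hypothetical helper mirroring the wiring around the hunk above
      static CredentialsProvider build(String user, String password) {
        BasicCredentialsProvider provider = new BasicCredentialsProvider();
        provider.setCredentials(AuthScope.ANY,
            new UsernamePasswordCredentials(user, password));
        return provider;
      }
    }

With the fix, the provider returned here is applied in both the http and https client-config callbacks.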
From 3bb8b0eeb90f7ba1304ef807cf87f28d0a6341f5 Mon Sep 17 00:00:00 2001
From: tballison
Date: Wed, 30 Aug 2023 12:20:53 -0400
Subject: [PATCH 02/28] NUTCH-2999 -- upgrade Lucene to latest 8.x throughout

---
 src/plugin/indexer-elastic/plugin.xml        | 28 +++++++++----------
 src/plugin/indexer-opensearch-1x/plugin.xml  | 28 +++++++++----------
 src/plugin/parsefilter-naivebayes/ivy.xml    | 11 +-------
 src/plugin/parsefilter-naivebayes/plugin.xml | 15 ----------
 src/plugin/scoring-similarity/ivy.xml        |  2 +-
 src/plugin/scoring-similarity/plugin.xml     |  4 +--
 .../similarity/util/LuceneAnalyzerUtil.java  |  4 +--
 .../similarity/util/LuceneTokenizer.java     |  6 ++--
 8 files changed, 37 insertions(+), 61 deletions(-)

diff --git a/src/plugin/indexer-elastic/plugin.xml b/src/plugin/indexer-elastic/plugin.xml
index 387a3ac664..679979d32a 100644
--- a/src/plugin/indexer-elastic/plugin.xml
+++ b/src/plugin/indexer-elastic/plugin.xml
@@ -48,20 +48,20 @@
 [14 <library> version bumps; the XML did not survive extraction]

diff --git a/src/plugin/indexer-opensearch-1x/plugin.xml b/src/plugin/indexer-opensearch-1x/plugin.xml
index 1bf5affc2f..e1dde463dd 100644
--- a/src/plugin/indexer-opensearch-1x/plugin.xml
+++ b/src/plugin/indexer-opensearch-1x/plugin.xml
@@ -48,20 +48,20 @@
 [14 <library> version bumps; XML lost in extraction]

diff --git a/src/plugin/parsefilter-naivebayes/ivy.xml b/src/plugin/parsefilter-naivebayes/ivy.xml
index ca96ec57bc..c261adac62 100644
--- a/src/plugin/parsefilter-naivebayes/ivy.xml
+++ b/src/plugin/parsefilter-naivebayes/ivy.xml
@@ -35,15 +35,6 @@
 [<dependency> declarations consolidated/removed; XML lost in extraction]

diff --git a/src/plugin/parsefilter-naivebayes/plugin.xml b/src/plugin/parsefilter-naivebayes/plugin.xml
index 76f30de6b5..c4983e1c9b 100644
--- a/src/plugin/parsefilter-naivebayes/plugin.xml
+++ b/src/plugin/parsefilter-naivebayes/plugin.xml
@@ -25,21 +25,6 @@
 [<library> declarations removed; XML lost in extraction]

diff --git a/src/plugin/scoring-similarity/ivy.xml b/src/plugin/scoring-similarity/ivy.xml
index b889f8056c..1acd1d442d 100644
--- a/src/plugin/scoring-similarity/ivy.xml
+++ b/src/plugin/scoring-similarity/ivy.xml
@@ -36,7 +36,7 @@
 [Lucene <dependency> version bump; XML lost in extraction]

diff --git a/src/plugin/scoring-similarity/plugin.xml b/src/plugin/scoring-similarity/plugin.xml
index a0353c7189..4ed0592661 100644
--- a/src/plugin/scoring-similarity/plugin.xml
+++ b/src/plugin/scoring-similarity/plugin.xml
@@ -26,8 +26,8 @@
 [two Lucene <library> entries updated; XML lost in extraction]

diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
index eae5ba5e45..0c1e5fc62d 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
@@ -23,10 +23,10 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
 import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.CharArraySet;
 
 /**
@@ -54,7 +54,7 @@ public static enum StemFilterType { PORTERSTEM_FILTER, ENGLISHMINIMALSTEM_FILTER
   public LuceneAnalyzerUtil(StemFilterType stemFilterType, boolean useStopFilter) {
     LuceneAnalyzerUtil.stemFilterType = stemFilterType;
     if(useStopFilter) {
-      stopSet = StandardAnalyzer.STOP_WORDS_SET;
+      stopSet = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
     }
     else {
       stopSet = null;

diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index d09af8244d..8567a39b2c 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -23,10 +23,10 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
 import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.CharArraySet;
@@ -56,7 +56,7 @@ public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopF
     this.tokenizer = tokenizer;
     this.stemFilterType = stemFilterType;
     if(useStopFilter) {
-      stopSet = StandardAnalyzer.STOP_WORDS_SET;
+      stopSet = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
     }
     tokenStream = createTokenStream(content);
   }
@@ -78,7 +78,7 @@ public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> sto
     this.tokenizer = tokenizer;
     this.stemFilterType = stemFilterType;
     if(addToDefault) {
-      CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);;
+      CharArraySet stopSet = CharArraySet.copy(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
       for(String word : stopWords){
         stopSet.add(word);
       }
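The Java side of this upgrade is mechanical: the English stop-word list that used to be reached through StandardAnalyzer is taken from EnglishAnalyzer instead. A minimal, self-contained check of the replacement (the added "nutch" term is purely illustrative):

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.en.EnglishAnalyzer;

    public class StopWordsMigration {
      public static void main(String[] args) {
        // EnglishAnalyzer.ENGLISH_STOP_WORDS_SET replaces the old
        // StandardAnalyzer.STOP_WORDS_SET; copy() yields a mutable set
        CharArraySet stopSet = CharArraySet.copy(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        stopSet.add("nutch"); // custom additions, as in LuceneTokenizer
        System.out.println(stopSet.contains("the")); // -> true
      }
    }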
From cf74770f9642356d4c2cdc9d6f41aaf8a8928bcf Mon Sep 17 00:00:00 2001
From: tballison
Date: Wed, 30 Aug 2023 14:05:24 -0400
Subject: [PATCH 03/28] NUTCH-2978, upgrade to slf4j2 throughout, first steps

---
 ivy/ivy.xml                     |  19 +-
 src/plugin/any23/ivy.xml        |   4 +-
 src/plugin/any23/plugin.xml     | 325 ++++++++++++++++----------------
 src/plugin/build-plugin.xml     |   4 +-
 src/plugin/lib-rabbitmq/ivy.xml |   4 +-
 5 files changed, 181 insertions(+), 175 deletions(-)

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 18a6df2302..6366b891ce 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -36,9 +36,9 @@
 [slf4j <dependency> entries bumped to 2.x; XML lost in extraction]
@@ -51,10 +51,15 @@
 [logging <dependency> entries replaced/added; XML lost in extraction]

diff --git a/src/plugin/any23/ivy.xml b/src/plugin/any23/ivy.xml
index 3b755ee3fa..99f6d2da85 100644
--- a/src/plugin/any23/ivy.xml
+++ b/src/plugin/any23/ivy.xml
@@ -39,8 +39,8 @@
 [two <dependency> entries updated; XML lost in extraction]

diff --git a/src/plugin/any23/plugin.xml b/src/plugin/any23/plugin.xml
index dae8c47aa3..25e1b42144 100644
--- a/src/plugin/any23/plugin.xml
+++ b/src/plugin/any23/plugin.xml
@@ -26,170 +26,167 @@
 [the plugin's <library> list regenerated, ~160 entries; XML lost in extraction]

diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml
index f6e87e8057..3f0d9ca44a 100755
--- a/src/plugin/build-plugin.xml
+++ b/src/plugin/build-plugin.xml
@@ -265,5 +265,7 @@
 [classpath entries adjusted; XML lost in extraction]

diff --git a/src/plugin/lib-rabbitmq/ivy.xml b/src/plugin/lib-rabbitmq/ivy.xml
index 28665978b0..1b6ceac371 100644
--- a/src/plugin/lib-rabbitmq/ivy.xml
+++ b/src/plugin/lib-rabbitmq/ivy.xml
@@ -36,7 +36,9 @@
 [<dependency> declarations updated; XML lost in extraction]
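The ivy changes above (lost to extraction) move the logging stack to SLF4J 2.x. Calling code should not need to change: the basic Logger/LoggerFactory API is source-compatible, and what differs in 2.x is backend discovery (ServiceLoader providers instead of the 1.x static binder). A minimal smoke test under that assumption:

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class Slf4jSmokeTest {
      private static final Logger LOG = LoggerFactory.getLogger(Slf4jSmokeTest.class);

      public static void main(String[] args) {
        // identical source against slf4j-api 1.7.x and 2.x; only the
        // provider jar on the classpath decides where this ends up
        LOG.info("slf4j {} binding check", 2);
      }
    }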
From 8d9c77fd1b044f7c8fc51b70e34321cb9260cfbb Mon Sep 17 00:00:00 2001
From: tballison
Date: Wed, 30 Aug 2023 14:34:48 -0400
Subject: [PATCH 04/28] NUTCH-2999 -- upgrade lucene to latest 8.x throughout

---
 src/plugin/indexer-elastic/ivy.xml          | 15 ++++
 src/plugin/indexer-elastic/plugin.xml       | 89 ++++++++++---------
 .../howto_upgrade_opensearch.txt            | 33 +++++++
 src/plugin/indexer-opensearch-1x/ivy.xml    | 15 ++++
 src/plugin/indexer-opensearch-1x/plugin.xml | 88 +++++++++---------
 5 files changed, 152 insertions(+), 88 deletions(-)
 create mode 100644 src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt

diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml
index abdcceae29..e5cdfdf656 100644
--- a/src/plugin/indexer-elastic/ivy.xml
+++ b/src/plugin/indexer-elastic/ivy.xml
@@ -40,7 +40,22 @@
 [15 dependency/exclusion lines added; XML lost in extraction]

diff --git a/src/plugin/indexer-elastic/plugin.xml b/src/plugin/indexer-elastic/plugin.xml
index 679979d32a..fc3723a608 100644
--- a/src/plugin/indexer-elastic/plugin.xml
+++ b/src/plugin/indexer-elastic/plugin.xml
@@ -22,50 +22,51 @@
 [<library> list regenerated; XML lost in extraction]

diff --git a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt
new file mode 100644
index 0000000000..0725900445
--- /dev/null
+++ b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt
@@ -0,0 +1,33 @@
+1. Upgrade OpenSearch dependency in src/plugin/indexer-opensearch-1x/ivy.xml
+
+2. Upgrade the OpenSearch specific dependencies in src/plugin/indexer-opensearch-1x/plugin.xml
+   To get the list of dependencies and their versions execute:
+    $ cd src/plugin/indexer-opensearch-1x/
+    $ ant -f ./build-ivy.xml
+    $ ls lib | sed 's/^/    <library name="/g' | sed 's/$/"\/>/g'
+
+   In the plugin.xml replace all lines between
+      <!-- OpenSearch Rest Client dependencies -->
+   and
+      <!-- end of OpenSearch Rest Client dependencies -->
+   with the output of the command above.
+
+3. (Optionally) remove overlapping dependencies between indexer-opensearch-1x and Nutch core dependencies:
+   - check for libs present both in
+       build/lib
+     and
+       build/plugins/indexer-opensearch-1x/
+     (eventually with different versions)
+   - duplicated libs can be added to the exclusions of transitive dependencies in
+     build/plugins/indexer-opensearch-1x/ivy.xml
+   - but it should be made sure that the library versions in ivy/ivy.xml correspond to
+     those required by the OpenSearch client
+
+4. Remove the locally "installed" dependencies in src/plugin/indexer-opensearch-1x/lib/:
+
+   $ rm -rf lib/
+
+5. Build Nutch and run all unit tests:
+
+   $ cd ../../../
+   $ ant clean runtime test
\ No newline at end of file

diff --git a/src/plugin/indexer-opensearch-1x/ivy.xml b/src/plugin/indexer-opensearch-1x/ivy.xml
index 1505ad3c82..ae5d91e41e 100644
--- a/src/plugin/indexer-opensearch-1x/ivy.xml
+++ b/src/plugin/indexer-opensearch-1x/ivy.xml
@@ -40,7 +40,22 @@
 [15 dependency/exclusion lines added; XML lost in extraction]

diff --git a/src/plugin/indexer-opensearch-1x/plugin.xml b/src/plugin/indexer-opensearch-1x/plugin.xml
index e1dde463dd..ee0d45dc2a 100644
--- a/src/plugin/indexer-opensearch-1x/plugin.xml
+++ b/src/plugin/indexer-opensearch-1x/plugin.xml
@@ -22,50 +22,50 @@
 [<library> list regenerated; XML lost in extraction]
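Run from the plugin directory, the sed pipeline in step 2 simply wraps each jar in lib/ in a <library> element ready to paste into plugin.xml. Hypothetical output (the jar names below are invented for illustration):

    $ ls lib | sed 's/^/    <library name="/g' | sed 's/$/"\/>/g'
        <library name="opensearch-rest-high-level-client-1.3.11.jar"/>
        <library name="httpasyncclient-4.1.5.jar"/>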
From daedbc36ceeba506795973b75ead2f5b4b59ddd9 Mon Sep 17 00:00:00 2001
From: tballison
Date: Thu, 31 Aug 2023 09:07:29 -0400
Subject: [PATCH 05/28] NUTCH-2978 -- exclude reload4j and update
 LICENSE-binary and NOTICE-binary.

---
 LICENSE-binary |  2 --
 NOTICE-binary  |  8 --------
 ivy/ivy.xml    | 17 +++++++++++++----
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/LICENSE-binary b/LICENSE-binary
index 8e24a728e2..a0e05cbd1e 100644
--- a/LICENSE-binary
+++ b/LICENSE-binary
@@ -208,7 +208,6 @@ This product bundles some components that are also licensed under the Apache
 License Version 2.0:
 
-ch.qos.reload4j:reload4j
 com.101tec:zkclient
 com.amazonaws:aws-java-sdk-cloudsearch
 com.amazonaws:aws-java-sdk-core
@@ -758,7 +757,6 @@ org.jsoup:jsoup
 org.rypt:f8
 org.slf4j:jcl-over-slf4j
 org.slf4j:slf4j-api
-org.slf4j:slf4j-reload4j
 
 Mozilla Public License 1.1 (MPL 1.1)

diff --git a/NOTICE-binary b/NOTICE-binary
index 1aab2cb411..97e5b7d12f 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -163,10 +163,6 @@ AOP alliance (http://aopalliance.sourceforge.net)
 - license: Public Domain
   (licenses-binary/LICENSE-public-domain.txt)
 
-# ch.qos.reload4j:reload4j
-reload4j (https://reload4j.qos.ch)
-- license: The Apache Software License, Version 2.0
-
 # com.101tec:zkclient
 ZkClient (https://github.com/sgroschupf/zkclient)
 - license: The Apache Software License, Version 2.0
@@ -1100,10 +1096,6 @@ JCL 1.2 implemented over SLF4J (http://www.slf4j.org)
   (licenses-binary/LICENSE-mit-license.txt)
 # org.slf4j:slf4j-api
 SLF4J API Module (http://www.slf4j.org)
-- license: MIT License
-  (licenses-binary/LICENSE-mit-license.txt)
-# org.slf4j:slf4j-reload4j
-SLF4J Reload4j Binding (http://reload4j.qos.ch)
 - license: MIT License
   (licenses-binary/LICENSE-mit-license.txt)

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 6366b891ce..9e19cec33d 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -50,15 +50,21 @@
 [reload4j/slf4j-reload4j exclusions added to the logging dependencies; XML lost in extraction]
@@ -111,7 +117,10 @@
 [exclusions added; XML lost in extraction]
From 820d129a8adff9a34eed2ed3c04cfee377b56b63 Mon Sep 17 00:00:00 2001
From: tallison
Date: Wed, 13 Sep 2023 10:26:25 -0400
Subject: [PATCH 06/28] NUTCH-3000 - the selenium protocol should return the
 full html, not just the inner body element.

---
 .../apache/nutch/protocol/selenium/HttpWebClient.java | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
index 4b998d1bc8..b0b12004da 100644
--- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -234,7 +234,7 @@ public static void cleanUpDriver(WebDriver driver) {
   }
 
   /**
-   * Function for obtaining the HTML BODY using the selected
+   [replacement javadoc line lost in extraction]
    * selenium webdriver There are a number of configuration properties within
    * nutch-site.xml which determine whether to take screenshots of
@@ -244,7 +244,7 @@
    *          the URL to fetch and render
    * @param conf
    *          the {@link org.apache.hadoop.conf.Configuration}
-   * @return the rendered inner HTML page
+   * @return the html page
    */
   public static String getHtmlPage(String url, Configuration conf) {
     WebDriver driver = getDriverForPage(url, conf);
@@ -253,10 +253,7 @@ public static String getHtmlPage(String url, Configuration conf) {
     if (conf.getBoolean("take.screenshot", false)) {
       takeScreenshot(driver, conf);
     }
-
-    String innerHtml = driver.findElement(By.tagName("body"))
-        .getAttribute("innerHTML");
-    return innerHtml;
+    return driver.getPageSource();
 
     // I'm sure this catch statement is a code smell ; borrowing it from
     // lib-htmlunit
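The behavioral difference is easiest to see in isolation. A minimal sketch, assuming any configured Selenium WebDriver implementation:

    import org.openqa.selenium.WebDriver;

    public class PageSourceDemo {
      // getPageSource() serializes the whole document, including <head> and
      // the <html> element itself; body.getAttribute("innerHTML") returned
      // only the markup nested inside <body>
      static String render(WebDriver driver, String url) {
        driver.get(url);
        return driver.getPageSource();
      }
    }

Anything carried in the head (titles, meta tags, embedded JSON-LD) is therefore preserved for downstream parse filters.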
From b6f645a4d025fa136f557dd37e9aba611b425fbb Mon Sep 17 00:00:00 2001
From: tallison
Date: Wed, 13 Sep 2023 10:37:17 -0400
Subject: [PATCH 07/28] NUTCH-3001 - fix logic for grabbing bytes if there's
 no content type in the header

---
 .../nutch/protocol/selenium/HttpResponse.java | 78 +++++++++----------
 1 file changed, 37 insertions(+), 41 deletions(-)

diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
index bb3bf6357c..7506773748 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -269,55 +269,51 @@ public HttpResponse(Http http, URL url, CrawlDatum datum)
       String contentType = getHeader(Response.CONTENT_TYPE);
 
       // handle with Selenium only if content type in HTML or XHTML
-      if (contentType != null) {
-        if (contentType.contains("text/html")
-            || contentType.contains("application/xhtml")) {
-          readPlainContent(url);
-        } else {
-          try {
-            int contentLength = Integer.MAX_VALUE;
-            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
-            if (contentLengthString != null) {
-              try {
-                contentLength = Integer.parseInt(contentLengthString.trim());
-              } catch (NumberFormatException ex) {
-                throw new HttpException(
-                    "bad content length: " + contentLengthString);
-              }
-            }
-
-            if (http.getMaxContent() >= 0
-                && contentLength > http.getMaxContent()) {
-              contentLength = http.getMaxContent();
-            }
-
-            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
-            int bufferFilled = 0;
-            int totalRead = 0;
-            ByteArrayOutputStream out = new ByteArrayOutputStream();
-            while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
-                && totalRead + bufferFilled <= contentLength) {
-              totalRead += bufferFilled;
-              out.write(buffer, 0, bufferFilled);
-            }
-
-            content = out.toByteArray();
-
-          } catch (Exception e) {
-            if (code == 200)
-              throw new IOException(e.toString());
-            // for codes other than 200 OK, we are fine with empty content
-          } finally {
-            if (in != null) {
-              in.close();
-            }
-          }
-        }
-        if (httpHeaders != null) {
-          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
-        }
-      }
-
+      if (contentType != null &&
+          (contentType.contains("text/html") || contentType.contains("application/xhtml"))) {
+        readPlainContent(url);
+      } else {
+        try {
+          int contentLength = Integer.MAX_VALUE;
+          String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+          if (contentLengthString != null) {
+            try {
+              contentLength = Integer.parseInt(contentLengthString.trim());
+            } catch (NumberFormatException ex) {
+              throw new HttpException("bad content length: " + contentLengthString);
+            }
+          }
+
+          if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+            contentLength = http.getMaxContent();
+          }
+
+          byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+          int bufferFilled = 0;
+          int totalRead = 0;
+          ByteArrayOutputStream out = new ByteArrayOutputStream();
+          while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 &&
+              totalRead + bufferFilled <= contentLength) {
+            totalRead += bufferFilled;
+            out.write(buffer, 0, bufferFilled);
+          }
+
+          content = out.toByteArray();
+
+        } catch (Exception e) {
+          if (code == 200) {
+            throw new IOException(e.toString());
+          }
+          // for codes other than 200 OK, we are fine with empty content
+        } finally {
+          if (in != null) {
+            in.close();
+          }
+        }
+      }
+      if (httpHeaders != null) {
+        headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+      }
     } catch (KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
       throw new ProtocolException(e);
     } finally {
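The fix keeps the bounded-read loop but reaches it whenever the Content-Type is missing or non-HTML, instead of skipping the fetch entirely when the header is absent. The loop itself is worth seeing on its own; a self-contained rendering (the buffer size is arbitrary here):

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;

    public class BoundedRead {
      // copy at most `limit` bytes from `in`; mirrors the loop in the hunk above,
      // tolerating absent or malformed Content-Length values upstream
      static byte[] readAtMost(InputStream in, int limit) throws IOException {
        byte[] buffer = new byte[4096];
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        int read;
        int total = 0;
        while ((read = in.read(buffer, 0, buffer.length)) != -1
            && total + read <= limit) {
          total += read;
          out.write(buffer, 0, read);
        }
        return out.toByteArray();
      }
    }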
From 8a5ef498c5d930c367ca72547d07d4aaa2d55254 Mon Sep 17 00:00:00 2001
From: tallison
Date: Wed, 13 Sep 2023 11:03:09 -0400
Subject: [PATCH 08/28] Remove Any23 from Nutch

---
 LICENSE-binary                                |    5 -
 NOTICE-binary                                 |    5 +-
 NOTICE.txt                                    |    2 +-
 build.xml                                     |    5 -
 conf/nutch-default.xml                        |   16 -
 default.properties                            |    4 +-
 src/plugin/any23/build-ivy.xml                |   47 -
 src/plugin/any23/build.xml                    |   36 -
 src/plugin/any23/howto_upgrade_any23.txt      |   22 -
 src/plugin/any23/ivy.xml                      |   49 -
 src/plugin/any23/plugin.xml                   |  216 -
 .../any23/sample/BBC_News_Scotland.html       | 3780 -----------------
 src/plugin/any23/sample/microdata_basic.html  |  107 -
 .../nutch/any23/Any23IndexingFilter.java      |  117 -
 .../apache/nutch/any23/Any23ParseFilter.java  |  171 -
 .../org/apache/nutch/any23/package-info.java  |   24 -
 .../nutch/any23/TestAny23IndexingFilter.java  |   81 -
 .../nutch/any23/TestAny23ParseFilter.java     |  119 -
 src/plugin/build.xml                          |    3 -
 src/plugin/parse-tika/howto_upgrade_tika.txt  |    2 -
 20 files changed, 3 insertions(+), 4808 deletions(-)
 delete mode 100644 src/plugin/any23/build-ivy.xml
 delete mode 100644 src/plugin/any23/build.xml
 delete mode 100644 src/plugin/any23/howto_upgrade_any23.txt
 delete mode 100644 src/plugin/any23/ivy.xml
 delete mode 100644 src/plugin/any23/plugin.xml
 delete mode 100644 src/plugin/any23/sample/BBC_News_Scotland.html
 delete mode 100644 src/plugin/any23/sample/microdata_basic.html
 delete mode 100644 src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
 delete mode 100644 src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
 delete mode 100644 src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java
 delete mode 100644 src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java
 delete mode 100644 src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java

diff --git a/LICENSE-binary b/LICENSE-binary
index 8e24a728e2..5e11ec79c9 100644
--- a/LICENSE-binary
+++ b/LICENSE-binary
@@ -327,11 +327,6 @@ net.sourceforge.owlapi:owlapi-impl
 net.sourceforge.owlapi:owlapi-parsers
 net.sourceforge.owlapi:owlapi-rio
 net.sourceforge.owlapi:owlapi-tools
-org.apache.any23:apache-any23-api
-org.apache.any23:apache-any23-core
-org.apache.any23:apache-any23-csvutils
-org.apache.any23:apache-any23-encoding
-org.apache.any23:apache-any23-mime
 org.apache.avro:avro
 org.apache.commons:commons-collections4
 org.apache.commons:commons-compress

diff --git a/NOTICE-binary b/NOTICE-binary
index 1aab2cb411..61a5c1d0d2 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -29,7 +29,7 @@ code and source code.
 The following provides more details on the included cryptographic
 software:
 
-The plugins parse-tika and any23 use Apache Tika and the Bouncy Castle
+The parse-tika plugin uses Apache Tika and the Bouncy Castle
 generic encryption libraries for extracting text content and metadata
 from encrypted PDF files. See <http://www.bouncycastle.org/> for more
 details on Bouncy Castle and <http://tika.apache.org> for details
@@ -46,9 +46,6 @@ on Apache Tika.
 Apache projects
 ---------------
 
-Apache Any23 (https://any23.apache.org/)
-  see https://github.com/apache/any23/blob/master/NOTICE.txt
-
 Apache Avro (https://avro.apache.org)
   see https://github.com/apache/avro/blob/master/NOTICE.txt

diff --git a/NOTICE.txt b/NOTICE.txt
index 939ddc8031..4fdd968ab0 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -29,7 +29,7 @@ code and source code.
 The following provides more details on the included cryptographic
 software:
 
-The plugins parse-tika and any23 use Apache Tika and the Bouncy Castle
+The parse-tika plugin uses Apache Tika and the Bouncy Castle
 generic encryption libraries for extracting text content and metadata
 from encrypted PDF files. See <http://www.bouncycastle.org/> for more
 details on Bouncy Castle and <http://tika.apache.org> for details

diff --git a/build.xml b/build.xml
index 9326a8ba21..b44581405a 100644
--- a/build.xml
+++ b/build.xml
@@ -202,7 +202,6 @@
 [any23 build entry removed; XML lost in extraction]
@@ -687,7 +686,6 @@
 [any23 build entry removed; XML lost in extraction]
@@ -772,7 +770,6 @@
 [any23 build entry removed; XML lost in extraction]
@@ -1180,8 +1177,6 @@
 [two any23 build entries removed; XML lost in extraction]

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index e98bd55708..58455b338c 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1353,22 +1353,6 @@
-<property>
-  <name>any23.extractors</name>
-  <value>html-microdata</value>
-  <description>Comma-separated list of Any23 extractors (a list of
-  extractors is available here: http://any23.apache.org/getting-started.html)
-  </description>
-</property>
-
-<property>
-  <name>any23.content_types</name>
-  <value>text/html,application/xhtml+xml</value>
-  <description>Comma-separated list of content-types onto which Any23
-  extractors should be applied (see
-  http://www.iana.org/assignments/media-types/). If empty, all
-  content-types are supported.</description>
-</property>
diff --git a/default.properties b/default.properties
index df96199c1e..17e0bffbbc 100644
--- a/default.properties
+++ b/default.properties
@@ -210,6 +210,4 @@ plugins.misc=\
   org.apache.nutch.collection*:\
   org.apache.nutch.analysis.lang*:\
   org.creativecommons.nutch*:\
-  org.apache.nutch.microformats.reltag*:\
-  org.apache.nutch.any23*
-
+  org.apache.nutch.microformats.reltag*:

diff --git a/src/plugin/any23/build-ivy.xml b/src/plugin/any23/build-ivy.xml
deleted file mode 100644
index 6c7c6b906b..0000000000
--- a/src/plugin/any23/build-ivy.xml
+++ /dev/null
@@ -1,47 +0,0 @@
 [47 lines of Ant/Ivy build XML deleted; markup lost in extraction]

diff --git a/src/plugin/any23/build.xml b/src/plugin/any23/build.xml
deleted file mode 100644
index 790b18548d..0000000000
--- a/src/plugin/any23/build.xml
+++ /dev/null
@@ -1,36 +0,0 @@
 [36 lines of Ant build XML deleted; markup lost in extraction]

diff --git a/src/plugin/any23/howto_upgrade_any23.txt b/src/plugin/any23/howto_upgrade_any23.txt
deleted file mode 100644
index 32f9162f41..0000000000
--- a/src/plugin/any23/howto_upgrade_any23.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-1. Upgrade Any23 dependency in src/plugin/any23/ivy.xml
-
-2. Upgrade Any23's own dependencies in src/plugin/any23/plugin.xml
-   To get the list of dependencies and their versions execute:
-   $ cd src/plugin/any23/
-   $ ant -f ./build-ivy.xml
-   $ ls lib | sed 's/^/    <library name="/g' | sed 's/$/"\/>/g'
-
-   In the plugin.xml replace all lines between
-      <!-- Any23 dependencies -->
-   and
-      <!-- end of Any23 dependencies -->
-   with the output of the command above.
-
-3. Remove the locally "installed" dependencies in src/plugin/any23/lib/:
-
-   $ rm -rf lib/
-
-4. Build Nutch and run all unit tests:
-
-   $ cd ../../../
-   $ ant clean runtime test

diff --git a/src/plugin/any23/ivy.xml b/src/plugin/any23/ivy.xml
deleted file mode 100644
index 3b755ee3fa..0000000000
--- a/src/plugin/any23/ivy.xml
+++ /dev/null
@@ -1,49 +0,0 @@
 [49 lines of Ivy module XML deleted; markup lost in extraction]

diff --git a/src/plugin/any23/plugin.xml b/src/plugin/any23/plugin.xml
deleted file mode 100644
index dae8c47aa3..0000000000
--- a/src/plugin/any23/plugin.xml
+++ /dev/null
@@ -1,216 +0,0 @@
 [216 lines of plugin descriptor XML deleted; markup lost in extraction]

diff --git a/src/plugin/any23/sample/BBC_News_Scotland.html b/src/plugin/any23/sample/BBC_News_Scotland.html
deleted file mode 100644
index d7cb10a826..0000000000
--- a/src/plugin/any23/sample/BBC_News_Scotland.html
+++ /dev/null
@@ -1,3780 +0,0 @@
 [3,780 lines of deleted sample HTML: a saved BBC News Scotland page (dated 31 March 2014) used as an Any23 test fixture; the markup did not survive extraction]
\ No newline at end of file

diff --git a/src/plugin/any23/sample/microdata_basic.html b/src/plugin/any23/sample/microdata_basic.html
deleted file mode 100644
index 3ffca84251..0000000000
--- a/src/plugin/any23/sample/microdata_basic.html
+++ /dev/null
@@ -1,107 +0,0 @@
 [107 lines of deleted sample HTML: basic schema.org-style microdata examples (people, a band, a movie, a book, a cat named Hedral); markup lost in extraction]
\ No newline at end of file

diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
deleted file mode 100644
index 09dc32e02d..0000000000
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.any23;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>This implementation of {@link org.apache.nutch.indexer.IndexingFilter}
- * adds a triple(s) field to the {@link org.apache.nutch.indexer.NutchDocument}.</p>
- * <p>Triples are extracted via Apache Any23.</p>
- * @see org.apache.nutch.any23.Any23ParseFilter - */ -public class Any23IndexingFilter implements IndexingFilter { - - /** Logging instance */ - private static final Logger LOG = LoggerFactory.getLogger(Any23IndexingFilter.class); - - public static final String STRUCTURED_DATA = "structured_data"; - - private Configuration conf; - - /** - * Get the {@link Configuration} object - * @see org.apache.hadoop.conf.Configurable#getConf() - */ - @Override - public Configuration getConf() { - return this.conf; - } - - /** - * Set the {@link Configuration} object - * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration) - */ - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - /** - * - * @param doc - * document instance for collecting fields - * @param parse - * parse data instance - * @param url - * page url - * @param datum - * crawl datum for the page (fetch datum from segment containing - * fetch status and fetch time) - * @param inlinks - * page inlinks - * @return filtered NutchDocument - * @see org.apache.nutch.indexer.IndexingFilter#filter(NutchDocument, Parse, Text, CrawlDatum, Inlinks) - * - * @throws IndexingException if there is a fatl error whilst indexing - */ - @Override - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { - String[] metadata = parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES); - - if (metadata != null) { - for (String triple : metadata) { - Pattern pattern = Pattern.compile("^([^ ]+) ([^ ]+) (.+) \\."); - Matcher matcher = pattern.matcher(triple); - if (matcher.find()) { - Map map = new HashMap<>(); - map.put("node", matcher.group(1)); - map.put("key", matcher.group(2)); - map.put("short_key", keyToShortKey(matcher.group(2))); - map.put("value", matcher.group(3)); - doc.add("structured_data", map); - } else { - LOG.warn("Unsupported triple format " + triple); - } - } - } - return doc; - } - - private static String keyToShortKey(String key) { - if (key.startsWith("<") && key.endsWith(">")) { - key = key.substring(1, key.length() - 1); - } - String[] keyParts = key.split("/"); - String[] keySubParts = keyParts[keyParts.length - 1].split("#"); - return keySubParts[keySubParts.length - 1]; - } -} diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java deleted file mode 100644 index bed659f352..0000000000 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-package org.apache.nutch.any23;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.net.URISyntaxException;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.apache.any23.Any23;
-import org.apache.any23.extractor.ExtractionException;
-import org.apache.any23.filter.IgnoreAccidentalRDFa;
-import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
-import org.apache.any23.mime.TikaMIMETypeDetector;
-import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
-import org.apache.any23.writer.BenchmarkTripleHandler;
-import org.apache.any23.writer.NTriplesWriter;
-import org.apache.any23.writer.TripleHandler;
-import org.apache.any23.writer.TripleHandlerException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.DocumentFragment;
-
-/**
- * <p>This implementation of {@link org.apache.nutch.parse.HtmlParseFilter}
- * uses the Apache Any23 library
- * for parsing and extracting structured data in RDF format from a
- * variety of Web documents. The supported formats can be found at Apache Any23.</p>
- * <p>In this implementation triples are written as Notation3
- * and triples are identified within output triple streams by the presence of '\n'.
- * The presence of the '\n' is a characteristic specific to N3 serialization in Any23.
- * In order to use another/other writers implementing the
- * TripleHandler
- * interface, we will most likely need to identify an alternative data characteristic
- * which we can use to split triples streams.</p>
- */ -public class Any23ParseFilter implements HtmlParseFilter { - - /** Logging instance */ - private static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class); - - private Configuration conf = null; - - /** - * Constant identifier used as a Key for writing and reading - * triples to and from the metadata Map field. - */ - public static final String ANY23_TRIPLES = "Any23-Triples"; - - public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors"; - public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types"; - - private static class Any23Parser { - - Set triples = null; - - Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException { - this.triples = new TreeSet<>(); - try { - parse(url, htmlContent, contentType, extractorNames); - } catch (URISyntaxException e) { - LOG.error("Error parsing URI: {}", url, e); - throw new RuntimeException(e.getReason()); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Maintains a {@link java.util.Set} containing the triples - * @return a {@link java.util.Set} of triples. - */ - Set getTriples() { - return this.triples; - } - - private void parse(String url, String htmlContent, String contentType, String... extractorNames) - throws URISyntaxException, IOException, TripleHandlerException { - Any23 any23 = new Any23(extractorNames); - any23.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier())); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try (TripleHandler tHandler = new IgnoreTitlesOfEmptyDocuments( - new IgnoreAccidentalRDFa( - new NTriplesWriter(baos))); - BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler)) { - try { - any23.extract(htmlContent, url, contentType, "UTF-8", bHandler); - } catch (IOException e) { - LOG.error("Error while reading the source", e); - } catch (ExtractionException e) { - LOG.error("Error while extracting structured data", e); - } - - LOG.debug("Any23 BenchmarkTripleHandler.report: " + bHandler.report()); - - String n3 = baos.toString("UTF-8"); - String[] triplesStrings = n3.split("\n"); - Collections.addAll(this.triples, triplesStrings); - } catch (IOException e) { - LOG.error("Unexpected IOException", e); - } - } - } - - @Override - public Configuration getConf() { - return this.conf; - } - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - /** - * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment) - */ - @Override - public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { - String[] extractorNames = this.conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta"); - String[] supportedContentTypes = this.conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml"); - String contentType = content.getContentType(); - if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) { - LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType); - return parseResult; - } - - Any23Parser parser; - try { - String htmlContent = new String(content.getContent(), Charset.forName("UTF-8")); - parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames); - } catch (TripleHandlerException e) { - throw new RuntimeException("Error running Any23 parser: " + e.getMessage()); - } - Set triples = 
parser.getTriples(); - - Parse parse = parseResult.get(content.getUrl()); - Metadata metadata = parse.getData().getParseMeta(); - - for (String triple : triples) { - metadata.add(ANY23_TRIPLES, triple); - } - - return parseResult; - } -} diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java b/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java deleted file mode 100644 index 47010768c6..0000000000 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * This packages uses the Apache Any23 library - * for parsing and extracting structured data in RDF format from a - * variety of Web documents. The supported formats can be found - * at Apache Any23. - */ -package org.apache.nutch.any23; diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java deleted file mode 100644 index 1367e19c46..0000000000 --- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.any23; - -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestAny23IndexingFilter { - @Test - public void testAny23TriplesFields() throws Exception { - Configuration conf = NutchConfiguration.create(); - Any23IndexingFilter filter = new Any23IndexingFilter(); - filter.setConf(conf); - Assert.assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", - new Outlink[] { }, new Metadata()); - ParseImpl parse = new ParseImpl("test page", parseData); - String[] triples = new String[]{ - " .", - " \"77\" .", - " \"Zurique\"@pt ." - }; - for (String triple : triples) { - parse.getData().getParseMeta().add(Any23ParseFilter.ANY23_TRIPLES, triple); - } - try { - doc = filter.filter(doc, parse, new Text("http://nutch.apache.org/"), new CrawlDatum(), new Inlinks()); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.getMessage()); - } - List docTriples = doc.getField(Any23IndexingFilter.STRUCTURED_DATA).getValues(); - Assert.assertEquals(docTriples.size(), triples.length); - - Object triple = docTriples.get(0); - Assert.assertTrue(triple instanceof Map); - @SuppressWarnings("unchecked") - Map structuredData = (Map) triple; - Assert.assertEquals(structuredData.get("node"), ""); - Assert.assertEquals(structuredData.get("key"), ""); - Assert.assertEquals(structuredData.get("short_key"), "sameAs"); - Assert.assertEquals(structuredData.get("value"), ""); - - triple = docTriples.get(1); - Assert.assertTrue(triple instanceof Map); - structuredData = (Map) triple; - Assert.assertEquals(structuredData.get("node"), ""); - Assert.assertEquals(structuredData.get("key"), ""); - Assert.assertEquals(structuredData.get("short_key"), "yearHumidity"); - Assert.assertEquals(structuredData.get("value"), "\"77\""); - } -} diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java deleted file mode 100644 index 09c253fbc5..0000000000 --- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.any23; - -import java.io.File; -import java.io.IOException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParserNotFound; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -public class TestAny23ParseFilter { - - - private Configuration conf; - - private String fileSeparator = System.getProperty("file.separator"); - - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/any23/build.xml during plugin compilation. - private String file1 = "BBC_News_Scotland.html"; - - private String file2 = "microdata_basic.html"; - - private static final int EXPECTED_TRIPLES_1 = 79; - - private static final int EXPECTED_TRIPLES_2 = 40; - - @Before - public void setUp() { - this.conf = NutchConfiguration.create(); - conf.set("file.content.limit", "-1"); - conf.set("parser.timeout", "-1"); - conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links," - + "html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard," - + "html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate," - + "html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath"); - conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html"); - } - - @Test - public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException { - String[] triplesArray = getTriples(file1); - - Assert.assertEquals("We expect 117 tab-separated triples extracted by the filter", - EXPECTED_TRIPLES_1, triplesArray.length); - } - - @Test - public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException { - String[] triplesArray = getTriples(file2); - - Assert.assertEquals("We expect 40 tab-separated triples extracted by the filter", - EXPECTED_TRIPLES_2, triplesArray.length); - } - - @Test - public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException { - String[] triplesArray = getTriples(file1, "application/pdf"); - - Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored", - 0, triplesArray.length); - } - - public String[] extract(String urlString, File file, String contentType) { - try { - System.out.println(urlString); - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - content.setContentType(contentType); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.toString()); - } - return null; - } - - private String[] getTriples(String fileName) { - return getTriples(fileName, "text/html"); - } - - private String[] getTriples(String fileName, String contentType) { - String urlString = 
"file:" + sampleDir + fileSeparator + fileName; - - File file = new File(sampleDir + fileSeparator + fileName); - - return extract(urlString, file, contentType); - } -} diff --git a/src/plugin/build.xml b/src/plugin/build.xml index e83f252734..34688ed566 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -35,7 +35,6 @@ - @@ -114,7 +113,6 @@ - @@ -176,7 +174,6 @@ - diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt index cb3ed6be87..2dcf76c455 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.txt @@ -33,8 +33,6 @@ $ cd ../language-identifier/ -It should be noted that Any23 also has a dependency on Tika so you may wish to check that there are no classpath conflicts in the any23 plugin as well. - 7. Build Nutch and run all unit tests: $ cd ../../../ From 10f7c0c5823ae4b7867c89339acec64fec277058 Mon Sep 17 00:00:00 2001 From: tallison Date: Thu, 14 Sep 2023 13:50:58 -0400 Subject: [PATCH 09/28] NUTCH-2959 -- bump Tika to 2.9.0 --- ivy/ivy.xml | 2 +- src/plugin/language-identifier/ivy.xml | 2 +- src/plugin/language-identifier/plugin.xml | 8 +- src/plugin/parse-tika/ivy.xml | 3 +- src/plugin/parse-tika/plugin.xml | 123 +++++++++++----------- 5 files changed, 71 insertions(+), 67 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 18a6df2302..484da135b8 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -56,7 +56,7 @@ - + diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml index 395047c6fc..5c357a75b6 100644 --- a/src/plugin/language-identifier/ivy.xml +++ b/src/plugin/language-identifier/ivy.xml @@ -36,7 +36,7 @@ - + diff --git a/src/plugin/language-identifier/plugin.xml b/src/plugin/language-identifier/plugin.xml index 357c4a67cd..28cfd70317 100644 --- a/src/plugin/language-identifier/plugin.xml +++ b/src/plugin/language-identifier/plugin.xml @@ -27,15 +27,15 @@ - - + + - + - + diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index f0ec7a8d8c..d4718ea498 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -36,7 +36,7 @@ - + @@ -53,6 +53,7 @@ + diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index d88405bc1c..3bc0a822cb 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -26,84 +26,87 @@ - - - - - - - + + + + + + + - - - + + + - - + + - + - - + + + + + - - - + + + - - + + - - - - + + + + + - - - - - - - - - + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - From 51055ef47ac09e082ae74bfa2720a84af431da19 Mon Sep 17 00:00:00 2001 From: tballison Date: Thu, 14 Sep 2023 14:42:01 -0400 Subject: [PATCH 10/28] NUTCH-2978 -- update slf4j-api --- ivy/ivy.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 9e19cec33d..d2f86ded70 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -39,7 +39,7 @@ - + From 6bfeaf4e2b9042ac3d9787fbbf558b16310c099a Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 18 Sep 2023 15:07:40 -0400 Subject: [PATCH 11/28] NUTCH-2959 -- bump Tika to 2.9.0, bump common dependencies throughout --- ivy/ivy.xml | 14 +++++++------- src/plugin/indexer-cloudsearch/plugin.xml | 6 +++--- src/plugin/indexer-kafka/plugin.xml | 6 +++--- src/plugin/indexer-solr/plugin.xml | 2 +- src/plugin/lib-htmlunit/plugin.xml | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/ivy/ivy.xml 
b/ivy/ivy.xml index 9aceed2c13..ce6ee002bb 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -41,11 +41,11 @@ - + - - + + @@ -88,10 +88,10 @@ - - - - + + + + diff --git a/src/plugin/indexer-cloudsearch/plugin.xml b/src/plugin/indexer-cloudsearch/plugin.xml index 5b4425359a..f18bc49eab 100644 --- a/src/plugin/indexer-cloudsearch/plugin.xml +++ b/src/plugin/indexer-cloudsearch/plugin.xml @@ -29,9 +29,9 @@ - - - + + + diff --git a/src/plugin/indexer-kafka/plugin.xml b/src/plugin/indexer-kafka/plugin.xml index c5cc21c01c..e49b6d4c30 100644 --- a/src/plugin/indexer-kafka/plugin.xml +++ b/src/plugin/indexer-kafka/plugin.xml @@ -25,9 +25,9 @@ - - - + + + diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml index d49641cf9c..f672ac9ed0 100644 --- a/src/plugin/indexer-solr/plugin.xml +++ b/src/plugin/indexer-solr/plugin.xml @@ -17,7 +17,7 @@ - + diff --git a/src/plugin/lib-htmlunit/plugin.xml b/src/plugin/lib-htmlunit/plugin.xml index 95caaa3201..6f14209af4 100644 --- a/src/plugin/lib-htmlunit/plugin.xml +++ b/src/plugin/lib-htmlunit/plugin.xml @@ -50,16 +50,16 @@ - + - + - + From f11d383b62ea4dd7ff988a9962f1d5ccb3c82d10 Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 19 Sep 2023 09:38:17 -0400 Subject: [PATCH 12/28] NUTCH-2959 -- bump commons-io --- ivy/ivy.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index ce6ee002bb..2ef3599ab3 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -45,6 +45,7 @@ + From 0f801c15874b16217cd78745d4773f2b741a2dce Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 19 Sep 2023 11:24:58 -0400 Subject: [PATCH 13/28] NUTCH-2959 -- downgrade commons-io to match the version we expect to come out with Hadoop 3.4.0. --- ivy/ivy.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 2ef3599ab3..b391649ea7 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -45,7 +45,9 @@ - + + From 5be64d2dad755f55980a1ea767abfb8e9fcc808a Mon Sep 17 00:00:00 2001 From: tballison Date: Mon, 25 Sep 2023 09:09:20 -0400 Subject: [PATCH 14/28] NUTCH-3004 -- propagate ssl exception if message doesn't match "handshake alert..." --- .../src/java/org/apache/nutch/protocol/http/HttpResponse.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 260a7c19c2..48918dc514 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -150,6 +150,10 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum) + e.getMessage(); throw new HttpException(msg); } + } else { + String msg = "SSL connect to " + url + " failed with: " + + e.getMessage(); + throw new HttpException(msg, e); } } socket = sslsocket; From 417b8773231136eb48957f743c2bc3c21f624d4e Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 28 Sep 2023 12:05:50 +0200 Subject: [PATCH 15/28] NUTCH-2852 SpotBugs: Method invokes System.exit(...) - remove all calls of System.exit(...) 
in methods except main(args) of various "checker" tools --- .../org/apache/nutch/indexer/IndexingFiltersChecker.java | 4 ++-- src/java/org/apache/nutch/net/URLFilterChecker.java | 4 ++-- src/java/org/apache/nutch/net/URLNormalizerChecker.java | 4 ++-- src/java/org/apache/nutch/parse/ParserChecker.java | 4 ++-- src/java/org/apache/nutch/util/AbstractChecker.java | 9 ++++----- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java index 3aa7a05cba..1931c360d8 100644 --- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java +++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java @@ -93,7 +93,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } // read property "doIndex" for back-ward compatibility @@ -126,7 +126,7 @@ public int run(String[] args) throws Exception { } else if (i != args.length - 1) { System.err.println("ERR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } else { url = args[i]; } diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java index 7916cc5794..821f2e9267 100644 --- a/src/java/org/apache/nutch/net/URLFilterChecker.java +++ b/src/java/org/apache/nutch/net/URLFilterChecker.java @@ -41,7 +41,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } int numConsumed; @@ -53,7 +53,7 @@ public int run(String[] args) throws Exception { } else { System.err.println("ERROR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } } diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java index 586c7b2460..46fdd38cfb 100644 --- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java +++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java @@ -44,7 +44,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } int numConsumed; @@ -58,7 +58,7 @@ public int run(String[] args) throws Exception { } else { System.err.println("ERROR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } } diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 1533ab57cc..10eec4b244 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -104,7 +104,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } // initialize plugins early to register URL stream handlers to support @@ -138,7 +138,7 @@ public int run(String[] args) throws Exception { } else if (i != args.length - 1) { System.err.println("ERR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } else { url = args[i]; } diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java index 3116ede146..1374812250 100644 --- 
a/src/java/org/apache/nutch/util/AbstractChecker.java +++ b/src/java/org/apache/nutch/util/AbstractChecker.java @@ -72,8 +72,7 @@ protected int parseArgs(String[] args, int i) { protected int run() throws Exception { // In listening mode? if (tcpPort != -1) { - processTCP(tcpPort); - return 0; + return processTCP(tcpPort); } else if (stdin) { return processStdin(); } @@ -104,7 +103,7 @@ protected int processStdin() throws Exception { // Open TCP socket and process input @SuppressWarnings("resource") - protected void processTCP(int tcpPort) throws Exception { + protected int processTCP(int tcpPort) throws Exception { ServerSocket server = null; try { @@ -113,7 +112,7 @@ protected void processTCP(int tcpPort) throws Exception { LOG.info(server.toString()); } catch (Exception e) { LOG.error("Could not listen on port " + tcpPort, e); - System.exit(-1); + return -1; } while(true){ @@ -124,7 +123,7 @@ protected void processTCP(int tcpPort) throws Exception { thread.start(); } catch (Exception e) { LOG.error("Accept failed: " + tcpPort, e); - System.exit(-1); + return -1; } } } From a72a53a32d2183f8a8baefbd50afd007279e4857 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 28 Sep 2023 12:26:29 +0200 Subject: [PATCH 16/28] NUTCH-3007 Fix impossible casts - remove code blocks (else clauses) unneeded and containing impossible casts --- src/java/org/apache/nutch/fetcher/Fetcher.java | 13 ++----------- src/java/org/apache/nutch/parse/ParseSegment.java | 13 ++----------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 7cc87f40c6..3727dcebef 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -598,20 +598,11 @@ public Map run(Map args, String crawlId) throws Path segment = null; if(args.containsKey(Nutch.ARG_SEGMENTS)) { Object seg = args.get(Nutch.ARG_SEGMENTS); - if(seg instanceof Path) { + if (seg instanceof Path) { segment = (Path) seg; - } - else if(seg instanceof String){ + } else if (seg instanceof String) { segment = new Path(seg.toString()); } - else if(seg instanceof ArrayList) { - String[] segmentsArray = (String[])seg; - segment = new Path(segmentsArray[0].toString()); - - if(segmentsArray.length > 1){ - LOG.warn("Only the first segment of segments array is used."); - } - } } else { String segmentDir = crawlId+"/segments"; diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index 7e4707d399..c4e271feec 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -312,20 +312,11 @@ public Map run(Map args, String crawlId) throws Path segment = null; if(args.containsKey(Nutch.ARG_SEGMENTS)) { Object seg = args.get(Nutch.ARG_SEGMENTS); - if(seg instanceof Path) { + if (seg instanceof Path) { segment = (Path) seg; - } - else if(seg instanceof String){ + } else if (seg instanceof String) { segment = new Path(seg.toString()); } - else if(seg instanceof ArrayList) { - String[] segmentsArray = (String[])seg; - segment = new Path(segmentsArray[0].toString()); - - if(segmentsArray.length > 1){ - LOG.warn("Only the first segment of segments array is used."); - } - } } else { String segment_dir = crawlId+"/segments"; From 810b1d6ad50fa9021469b4ca5e1db9050a3263c5 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 30 Sep 2023 08:09:18 +0200 Subject: [PATCH 17/28] NUTCH-3010 Injector: count 
unique number of injected URLs - add counter urls_injected_unique - improve log messages reporting the counts of injected/merged URLs --- src/java/org/apache/nutch/crawl/Injector.java | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index b93e8ca76a..9fca719f62 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -341,8 +341,11 @@ public void reduce(Text key, Iterable values, Context context) ? injected.getFetchInterval() : old.getFetchInterval()); } } - if (injectedSet && oldSet) { - context.getCounter("injector", "urls_merged").increment(1); + if (injectedSet) { + context.getCounter("injector", "urls_injected_unique").increment(1); + if (oldSet) { + context.getCounter("injector", "urls_merged").increment(1); + } } context.write(key, result); } @@ -448,22 +451,24 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, if (LOG.isInfoEnabled()) { long urlsInjected = job.getCounters() .findCounter("injector", "urls_injected").getValue(); + long urlsInjectedUniq = job.getCounters() + .findCounter("injector", "urls_injected_unique").getValue(); long urlsFiltered = job.getCounters() .findCounter("injector", "urls_filtered").getValue(); long urlsMerged = job.getCounters() .findCounter("injector", "urls_merged").getValue(); - long urlsPurged404= job.getCounters() + long urlsPurged404 = job.getCounters() .findCounter("injector", "urls_purged_404").getValue(); - long urlsPurgedFilter= job.getCounters() + long urlsPurgedFilter = job.getCounters() .findCounter("injector", "urls_purged_filter").getValue(); - LOG.info("Injector: Total urls rejected by filters: " + urlsFiltered); + LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered); LOG.info( - "Injector: Total urls injected after normalization and filtering: " - + urlsInjected); - LOG.info("Injector: Total urls injected but already in CrawlDb: " - + urlsMerged); - LOG.info("Injector: Total new urls injected: " - + (urlsInjected - urlsMerged)); + "Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})", + urlsInjected, urlsInjectedUniq); + LOG.info("Injector: Total urls injected but already in CrawlDb: {}", + urlsMerged); + LOG.info("Injector: Total new urls injected: {}", + (urlsInjectedUniq - urlsMerged)); if (filterNormalizeAll) { LOG.info("Injector: Total urls removed from CrawlDb by filters: {}", urlsPurgedFilter); @@ -475,8 +480,8 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, } long end = System.currentTimeMillis(); - LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end), + TimingUtil.elapsedTime(start, end)); } } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) { LOG.error("Injector job failed: {}", e.getMessage()); From a1ab4333e0a1a28ac2e0f9c75871f7feeb5f2f81 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 30 Sep 2023 11:12:07 +0200 Subject: [PATCH 18/28] NUTCH-2897 Do not suppress deprecated API warnings - deprecate constructor of NutchJob - remove deprecated call to Object.finalize() from Plugin.finalize() --- src/java/org/apache/nutch/plugin/Plugin.java | 2 -- src/java/org/apache/nutch/util/NutchJob.java | 13 ++++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git
a/src/java/org/apache/nutch/plugin/Plugin.java b/src/java/org/apache/nutch/plugin/Plugin.java index b2e717d20e..3a0fb2e915 100644 --- a/src/java/org/apache/nutch/plugin/Plugin.java +++ b/src/java/org/apache/nutch/plugin/Plugin.java @@ -90,9 +90,7 @@ private void setDescriptor(PluginDescriptor descriptor) { } @Override - @SuppressWarnings("deprecation") protected void finalize() throws Throwable { - super.finalize(); shutDown(); } } diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index 478b24f89e..068c64fefb 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -35,7 +35,18 @@ public class NutchJob extends Job { private static final String JOB_FAILURE_LOG_FORMAT = "%s job did not succeed, job id: %s, job status: %s, reason: %s"; - @SuppressWarnings("deprecation") + /** + * @deprecated, use instead {@link #getInstance(Configuration)} or + * {@link Job#getInstance(Configuration, String)}. + * + * @param conf + * configuration for the job + * @param jobName + * name of the job + * @throws IOException + * see {@link Job#Job(Configuration, String)} + */ + @Deprecated public NutchJob(Configuration conf, String jobName) throws IOException { super(conf, jobName); if (conf != null) { From a74b57b90409b9488caa169e7bc3c6d1ff8067f4 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 30 Sep 2023 11:16:59 +0200 Subject: [PATCH 19/28] NUTCH-2853 bin/nutch: remove deprecated commands solrindex, solrdedup, solrclean --- src/bin/nutch | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/bin/nutch b/src/bin/nutch index 5b999fa6f5..561c79e778 100755 --- a/src/bin/nutch +++ b/src/bin/nutch @@ -81,9 +81,6 @@ if [ $# = 0 ]; then echo " dedup deduplicate entries in the crawldb and give them a special status" echo " dump exports crawled data from segments into files" echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR" - echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead" - echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead" - echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead" echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins" echo " parsechecker check the parser for a given url" echo " indexchecker check the indexing filters for a given url" @@ -253,19 +250,14 @@ elif [ "$COMMAND" = "dump" ] ; then CLASS=org.apache.nutch.tools.FileDumper elif [ "$COMMAND" = "commoncrawldump" ] ; then CLASS=org.apache.nutch.tools.CommonCrawlDataDumper -elif [ "$COMMAND" = "solrindex" ] ; then - CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1" - shift +elif [ "$COMMAND" = "solrindex" ] || [ "$COMMAND" = "solrdedup" ] || [ "$COMMAND" = "solrclean" ]; then + REPLACEMENT="${COMMAND#solr}" + echo "The command $COMMAND was replaced by the command $REPLACEMENT" + exit -1 elif [ "$COMMAND" = "index" ] ; then CLASS=org.apache.nutch.indexer.IndexingJob -elif [ "$COMMAND" = "solrdedup" ] ; then - echo "Command $COMMAND is deprecated, please use dedup instead" - exit -1 elif [ "$COMMAND" = "dedup" ] ; then CLASS=org.apache.nutch.crawl.DeduplicationJob -elif [ "$COMMAND" = "solrclean" ] ; then - CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1" - shift; shift elif [ "$COMMAND" = "clean" ] ; then 
CLASS=org.apache.nutch.indexer.CleaningJob elif [ "$COMMAND" = "parsechecker" ] ; then From 9faf364a7fa1631f553a36b8234c1169eba0f5c3 Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 3 Oct 2023 10:48:09 -0400 Subject: [PATCH 20/28] Working now locally and with Seb's single_node_cluster tests --- ivy/ivy.xml | 2 +- ivy/ivysettings.xml | 7 ++ src/plugin/language-identifier/ivy.xml | 8 +-- src/plugin/language-identifier/plugin.xml | 11 +-- src/plugin/parse-tika/ivy.xml | 19 +---- src/plugin/parse-tika/plugin.xml | 84 +---------------------- 6 files changed, 12 insertions(+), 119 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index b391649ea7..6f39262449 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -70,7 +70,7 @@ - + diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml index 18038a5ca4..a060df5b69 100644 --- a/ivy/ivysettings.xml +++ b/ivy/ivysettings.xml @@ -32,7 +32,14 @@ + + + + + - - - - - - - + diff --git a/src/plugin/language-identifier/plugin.xml b/src/plugin/language-identifier/plugin.xml index 28cfd70317..dab1a52f31 100644 --- a/src/plugin/language-identifier/plugin.xml +++ b/src/plugin/language-identifier/plugin.xml @@ -26,16 +26,7 @@ - - - - - - - - - - + diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index d4718ea498..1586d9661f 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -36,24 +36,7 @@ - - - - - - - - - - - - - - - - - + diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index 3bc0a822cb..dd4fe7fde8 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -25,89 +25,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + From 9aabc459a4525f1f50f4597cf39599441403cc69 Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 3 Oct 2023 11:02:54 -0400 Subject: [PATCH 21/28] update howto_upgrade_tika.txt --- src/plugin/parse-tika/howto_upgrade_tika.txt | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt index 2dcf76c455..46d075948b 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.txt @@ -1,3 +1,25 @@ +We are currently using a shim (https://github.com/tballison/hadoop-safe-tika +because of binary conflicts in commons-io versions between what Hadoop supports and the more +modern features that Apache Tika and Apache POI were using in commons-io. + +For now, all you have to do is update the fat jar dependencies: + +1. tika-core-shaded in ivy/ivy.xml + +2. tika-parsers-standard-package-shaded in src/plugin/parse-tika/ivy.xml + +3. The library name version for tika-parsers-standard-package-shaded in src/plugin/parse-tika/plugin.xml + +4. Repeat steps 2 and 3 for the language-identifier + +5. Build Nutch and run all unit tests: + + $ cd ../../../ + $ ant clean runtime test + +The following directions are what we used to do with thin jars. Hopefully, we'll +be able to get back to these directions once we have version harmony with Hadoop and Tika/POI. + 1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml 2. 
Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml From e96cfc56ee04c8e7e07e11d4eef521b4674a9ec6 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 19 Sep 2023 08:10:14 +0200 Subject: [PATCH 22/28] NUTCH-3002 Protocol-okhttp HttpResponse: HTTP header metadata lookup should be case-insensitive - implement class CaseInsensitiveMetadata providing case-insensitive metadata look-ups (but no spell-checking) - use CaseInsensitiveMetadata to hold HTTP header metadata in in the class OkHttpResponse of protocol-okhttp - add unit tests to prove the fix (and also case-insensitive look-ups and spell-checking in protocol-http) --- .../metadata/CaseInsensitiveMetadata.java | 33 ++++ .../org/apache/nutch/metadata/Metadata.java | 4 +- .../nutch/metadata/SpellCheckedMetadata.java | 8 +- .../apache/nutch/net/protocols/Response.java | 2 +- .../nutch/protocol/http/TestResponse.java | 152 +++++++++++++++++ .../nutch/protocol/okhttp/OkHttpResponse.java | 3 +- .../nutch/protocol/okhttp/TestResponse.java | 154 ++++++++++++++++++ 7 files changed, 348 insertions(+), 8 deletions(-) create mode 100644 src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java create mode 100644 src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java create mode 100644 src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java diff --git a/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java new file mode 100644 index 0000000000..92e848ca2d --- /dev/null +++ b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import java.util.TreeMap; + +/** + * A decorator to Metadata that adds for case-insensitive lookup of keys. + */ +public class CaseInsensitiveMetadata extends Metadata { + + /** + * Constructs a new, empty metadata. + */ + public CaseInsensitiveMetadata() { + metadata = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + } + +} diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java index 5c37911fb9..7fa0bb12ce 100644 --- a/src/java/org/apache/nutch/metadata/Metadata.java +++ b/src/java/org/apache/nutch/metadata/Metadata.java @@ -36,7 +36,7 @@ public class Metadata implements Writable, CreativeCommons, DublinCore, /** * A map of all metadata attributes. */ - private Map metadata = null; + protected Map metadata = null; /** * Constructs a new, empty metadata. @@ -66,7 +66,7 @@ public String[] names() { } /** - * Get the value associated to a metadata name. If many values are assiociated + * Get the value associated to a metadata name. 
If many values are associated * to the specified name, then the first one is returned. * * @param name diff --git a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java index fdbf1b62c8..be161440e2 100644 --- a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java +++ b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java @@ -25,7 +25,7 @@ /** * A decorator to Metadata that adds spellchecking capabilities to property - * names. Currently used spelling vocabulary contains just the httpheaders from + * names. Currently used spelling vocabulary contains just the HTTP headers from * {@link HttpHeaders} class. * */ @@ -94,7 +94,7 @@ private static String normalize(final String str) { /** * Get the normalized name of metadata attribute name. This method tries to * find a well-known metadata name (one of the metadata names defined in this - * class) that matches the specified name. The matching is error tolerent. For + * class) that matches the specified name. The matching is error tolerant. For * instance, *
 *
 * - content-type gives Content-Type
  • @@ -105,8 +105,8 @@ private static String normalize(final String str) { * name is returned. * * @param name - * Name to normalize - * @return normalized name + * HTTP header name to normalize + * @return normalized HTTP header name */ public static String getNormalizedName(final String name) { String searched = normalize(name); diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index 0159358ec0..514ce85613 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -86,7 +86,7 @@ public static enum TruncatedContentReason { /** * Get the value of a named header. - * @param name key of the header you wish to retreive + * @param name key of the header you wish to retrieve * @return header value */ public String getHeader(String name); diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java new file mode 100644 index 0000000000..9d65b6df88 --- /dev/null +++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.http; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest; +import org.apache.nutch.protocol.ProtocolException; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; + +public class TestResponse extends AbstractHttpProtocolPluginTest { + + protected static final String redirectHeader = "HTTP/1.1 301 Moved Permanently\r\n" // + + "Content-Type: text/html; charset=UTF-8\r\n" // + + "Content-Length: 0\r\n"; + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + @Override + protected String getPluginClassName() { + return "org.apache.nutch.protocol.okhttp.OkHttp"; + } + + @Override + @Before + public void setUp() throws Exception { + conf = new Configuration(); + conf.addResource("nutch-default.xml"); + /* + * plugin tests specific config file - needs to add the tested plugin to + * plugin.includes + */ + conf.addResource("nutch-site-test.xml"); + conf.setBoolean("store.http.headers", true); + + http = new Http(); + http.setConf(conf); + } + + protected HttpResponse getResponse(int statusCode, String headerName) { + try { + URL url = new URL(protocol, localHost, defaultPort, "/" + headerName); + LOG.info("Emulating fetch of {}", url); + return new HttpResponse((Http) http, url, new CrawlDatum(statusCode, 1000)); + } catch (ProtocolException | IOException e) { + return null; + } + } + + protected void headerTest(int statusCode, String headerName, String value, String lookupName) { + HttpResponse response = getResponse(statusCode, headerName); + LOG.info("Response headers:"); + LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS)); + assertEquals( + "No or unexpected value of header \"" + headerName + + "\" returned when retrieving header \"" + lookupName + "\"", + value, response.getHeader(lookupName)); + } + + protected Map getResponses(String headerValue) { + String[] headerNames = { "Location", "location", "LOCATION", "Loction" }; + Map responses = new TreeMap<>(); + for (String headerName : headerNames) { + responses.put("/" + headerName, + (redirectHeader + headerName + ": " + headerValue + "\r\n" + + "Content-Length: 0\r\n\r\n").getBytes(UTF_8)); + } + responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": " + + headerValue + "\r\n" + simpleContent).getBytes(UTF_8)); + return responses; + } + + @Test + public void testGetHeader() throws Exception { + String value = "headervalue"; + launchServer(getResponses(value)); + + LOG.info( + "Testing standard HTTP header \"Location\": expected case-insensitive and error-tolerant matching"); + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + headerTest(301, "Loction", value, "Location"); + + LOG.info( + "Testing non-standard HTTP header \"MyCustomHeader\": only exact matching"); + headerTest(200, "MyCustomHeader", value, "MyCustomHeader"); + /* + * The following 
case-insensitive or approximate look-ups are not supported + * for non-standard headers by SpellCheckedMetadata: + */ + // testHeader(200, "MyCustomHeader", value, "mycustomheader"); + // testHeader(200, "mycustomheader", value, "MyCustomHeader"); + // testHeader(200, "MYCUSTOMHEADER", value, "MyCustomHeader"); + } + + @Ignore("Only for benchmarking") + @Test + public void testMetadataBenchmark() throws MalformedURLException, ProtocolException, + IOException, InterruptedException { + String value = "headervalue"; + launchServer(getResponses(value)); + Thread.sleep(30000); // time to attach a profiler + int iterations = 4000; + LOG.info("Starting benchmark with {} iterations ({} calls)", iterations, + (iterations * 5)); + long start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + headerTest(301, "Loction", value, "Location"); + } + long elapsed = System.currentTimeMillis() - start; + LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed, + (elapsed / (5.0 * iterations))); + } + +} diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 67bc45b035..605c03390f 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -24,6 +24,7 @@ import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.CaseInsensitiveMetadata; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; @@ -106,7 +107,7 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum) // ensure that Response and underlying ResponseBody are closed try (okhttp3.Response response = call.execute()) { - Metadata responsemetadata = new Metadata(); + Metadata responsemetadata = new CaseInsensitiveMetadata(); okhttp3.Headers httpHeaders = response.headers(); for (int i = 0, size = httpHeaders.size(); i < size; i++) { diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java new file mode 100644 index 0000000000..695a6c539c --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.okhttp; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest; +import org.apache.nutch.protocol.ProtocolException; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; + +public class TestResponse extends AbstractHttpProtocolPluginTest { + + protected static final String redirectHeader = "HTTP/1.1 301 Moved Permanently\r\n" // + + "Content-Type: text/html; charset=UTF-8\r\n" // + + "Content-Length: 0\r\n"; + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + @Override + protected String getPluginClassName() { + return "org.apache.nutch.protocol.okhttp.OkHttp"; + } + + @Override + @Before + public void setUp() throws Exception { + conf = new Configuration(); + conf.addResource("nutch-default.xml"); + /* + * plugin tests specific config file - needs to add the tested plugin to + * plugin.includes + */ + conf.addResource("nutch-site-test.xml"); + conf.setBoolean("store.http.headers", true); + + http = new OkHttp(); + http.setConf(conf); + } + + protected OkHttpResponse getResponse(int statusCode, String headerName) { + try { + URL url = new URL(protocol, localHost, defaultPort, "/" + headerName); + LOG.info("Emulating fetch of {}", url); + return new OkHttpResponse((OkHttp) http, url, new CrawlDatum(statusCode, 1000)); + } catch (ProtocolException | IOException e) { + return null; + } + } + + protected void headerTest(int statusCode, String headerName, String value, String lookupName) { + OkHttpResponse response = getResponse(statusCode, headerName); + LOG.info("Response headers:"); + LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS)); + assertEquals( + "No or unexpected value of header \"" + headerName + + "\" returned when retrieving header \"" + lookupName + "\"", + value, response.getHeader(lookupName)); + } + + protected Map getResponses(String headerValue) { + String[] headerNames = { "Location", "location", "LOCATION", "Loction" }; + Map responses = new TreeMap<>(); + for (String headerName : headerNames) { + responses.put("/" + headerName, + (redirectHeader + headerName + ": " + headerValue + "\r\n" + + "Content-Length: 0\r\n\r\n").getBytes(UTF_8)); + } + responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": " + + headerValue + "\r\n" + simpleContent).getBytes(UTF_8)); + return responses; + } + + @Test + public void testGetHeader() throws Exception { + String value = "headervalue"; + launchServer(getResponses(value)); + + LOG.info( + "Testing standard HTTP header \"Location\": expected case-insensitive and error-tolerant matching"); + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + // only with SpellCheckedMetadata: + // headerTest(301, "Loction", value, "Location"); + + LOG.info( + "Testing non-standard HTTP header \"MyCustomHeader\": only exact matching"); + headerTest(200, "MyCustomHeader", 
value, "MyCustomHeader"); + /* + * The following case-insensitive or approximate look-ups are not supported + * for non-standard headers by SpellCheckedMetadata: + */ + // testHeader(200, "MyCustomHeader", value, "mycustomheader"); + // testHeader(200, "mycustomheader", value, "MyCustomHeader"); + // testHeader(200, "MYCUSTOMHEADER", value, "MyCustomHeader"); + } + + @Ignore("Only for benchmarking") + @Test + public void testMetadataBenchmark() throws MalformedURLException, ProtocolException, + IOException, InterruptedException { + String value = "headervalue"; + launchServer(getResponses(value)); + Thread.sleep(30000); // time to attach a profiler + int iterations = 5000; + LOG.info("Starting benchmark with {} iterations ({} calls)", iterations, + (iterations * 4)); + long start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + // only with SpellCheckedMetadata: + // headerTest(301, "Loction", value, "Location"); + } + long elapsed = System.currentTimeMillis() - start; + LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed, + (elapsed / (4.0 * iterations))); + } + +} From bb68385f9601b37c61ef5a2baac58740c975bddb Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 28 Sep 2023 14:53:02 +0200 Subject: [PATCH 23/28] NUTCH-3009 Upgrade to Hadoop 3.3.6 --- default.properties | 2 +- ivy/ivy.xml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/default.properties b/default.properties index 17e0bffbbc..06f2ed0096 100644 --- a/default.properties +++ b/default.properties @@ -44,7 +44,7 @@ test.junit.output.format = plain javadoc.proxy.host=-J-DproxyHost= javadoc.proxy.port=-J-DproxyPort= javadoc.link.java=https://docs.oracle.com/en/java/javase/11/docs/api/ -javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.3.4/api/ +javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.3.6/api/ javadoc.packages=org.apache.nutch.* dist.dir=./dist diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 6f39262449..e5ae3882f5 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -53,19 +53,19 @@ - + - + - + - + From ecdd19dbdd4424bf9b9bce206f23992140ee43fe Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 21 Oct 2023 15:53:25 +0200 Subject: [PATCH 24/28] NUTCH-2990 HttpRobotRulesParser to follow 5 redirects as specified by RFC 9309 (#779) - follow multiple redirects when fetching robots.txt - number of followed redirects is configurable by the property http.robots.redirect.max (default: 5) Improvements to RobotRulesParser's robots.txt test utility - bug fix: the passed agent names need to be transferred to the property http.robots.agents earlier, before the protocol plugins are configured - more verbose debug logging --- conf/nutch-default.xml | 10 ++ .../nutch/protocol/RobotRulesParser.java | 32 ++-- .../http/api/HttpRobotRulesParser.java | 141 ++++++++++++++---- 3 files changed, 143 insertions(+), 40 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 58455b338c..18ed56b037 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -163,6 +163,16 @@ + + http.robots.redirect.max + 5 + Maximum number of redirects followed when fetching + a robots.txt file. RFC 9309 specifies that "crawlers SHOULD + follow at least five consecutive redirects, even across authorities + (for example, hosts in the case of HTTP)." 
+ + + http.agent.description diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java index 562c2c694f..d73c075060 100644 --- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java +++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java @@ -98,6 +98,7 @@ public abstract class RobotRulesParser implements Tool { protected Configuration conf; protected Set agentNames; + protected int maxNumRedirects = 5; /** set of host names or IPs to be explicitly excluded from robots.txt checking */ protected Set allowList = new HashSet<>(); @@ -149,6 +150,10 @@ public void setConf(Configuration conf) { } } } + LOG.info("Checking robots.txt for the following agent names: {}", agentNames); + + maxNumRedirects = conf.getInt("http.robots.redirect.max", 5); + LOG.info("Following max. {} robots.txt redirects", maxNumRedirects); String[] confAllowList = conf.getStrings("http.robot.rules.allowlist"); if (confAllowList == null) { @@ -294,8 +299,11 @@ public int run(String[] args) { "", "\tlocal file or URL parsed as robots.txt file", "\tIf starts with a protocol specification", - "\t(`http', `https', `ftp' or `file'), robots.txt it is fetched", - "\tusing the specified protocol. Otherwise, a local file is assumed.", + "\t(`http', `https', `ftp' or `file'), the URL is parsed, URL path", + "\tand query are removed and the path \"/robots.txt\" is appended.", + "\tThe resulting URL (the canonical robots.txt location) is then", + "\tfetched using the specified protocol.", + "\tIf the URL does not include a protocol, a local file is assumed.", "", "\tlocal file with URLs (one per line), for every URL", "\tthe path part (including the query) is checked whether", @@ -323,6 +331,16 @@ public int run(String[] args) { return -1; } + if (args.length > 2) { + // set agent name from command-line in configuration + // Note: when fetching via protocol this must be done + // before the protocol is configured + String agents = args[2]; + conf.set("http.robots.agents", agents); + conf.set("http.agent.name", agents.split(",")[0]); + setConf(conf); + } + Protocol protocol = null; URL robotsTxtUrl = null; if (args[0].matches("^(?:https?|ftp|file)://?.*")) { @@ -334,6 +352,7 @@ public int run(String[] args) { ProtocolFactory factory = new ProtocolFactory(conf); try { protocol = factory.getProtocol(robotsTxtUrl); + LOG.debug("Using protocol {} to fetch robots.txt", protocol.getClass()); } catch (ProtocolNotFound e) { LOG.error("No protocol found for {}: {}", args[0], StringUtils.stringifyException(e)); @@ -357,14 +376,6 @@ public int run(String[] args) { File urlFile = new File(args[1]); - if (args.length > 2) { - // set agent name from command-line in configuration and update parser - String agents = args[2]; - conf.set("http.robots.agents", agents); - conf.set("http.agent.name", agents.split(",")[0]); - setConf(conf); - } - List robotsTxtContent = null; if (getConf().getBoolean("fetcher.store.robotstxt", false)) { robotsTxtContent = new LinkedList<>(); @@ -373,6 +384,7 @@ public int run(String[] args) { try { BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl, robotsTxtContent); + LOG.debug("Robots.txt rules:\n{}", rules); if (robotsTxtContent != null) { for (Content robotsTxt : robotsTxtContent) { diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index db09a0c880..8d7263e3ea 100644 --- 
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -17,12 +17,15 @@ package org.apache.nutch.protocol.http.api; import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; import java.net.URL; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - +import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.net.protocols.Response; @@ -87,6 +90,13 @@ protected static String getCacheKey(URL url) { * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the * rules are cached to avoid re-fetching and re-parsing it again. * + *

    Following + * RFC + * 9309, section 2.3.1.2. Redirects, up to five consecutive HTTP redirects + * are followed when fetching the robots.txt file. The max. number of + * redirects followed is configurable by the property + * http.robots.redirect.max.

    + * * @param http * The {@link Protocol} object * @param url @@ -114,11 +124,11 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, if (robotRules != null) { return robotRules; // cached rule } else if (LOG.isTraceEnabled()) { - LOG.trace("cache miss {}", url); + LOG.trace("Robots.txt cache miss {}", url); } boolean cacheRule = true; - URL redir = null; + Set redirectCacheKeys = new HashSet<>(); if (isAllowListed(url)) { // check in advance whether a host is allowlisted @@ -129,43 +139,97 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, url.getHost()); } else { + URL robotsUrl = null, robotsUrlRedir = null; try { - URL robotsUrl = new URL(url, "/robots.txt"); + robotsUrl = new URL(url, "/robots.txt"); + + /* + * Redirect counter - following redirects up to the configured maximum + * ("five consecutive redirects" as per RFC 9309). + */ + int numRedirects = 0; + /* + * The base URL to resolve relative redirect locations is set initially + * to the default URL path ("/robots.txt") and updated when redirects + * were followed. + */ + robotsUrlRedir = robotsUrl; + Response response = ((HttpBase) http).getResponse(robotsUrl, new CrawlDatum(), true); + int code = response.getCode(); if (robotsTxtContent != null) { addRobotsContent(robotsTxtContent, robotsUrl, response); } - // try one level of redirection ? - if (response.getCode() == 301 || response.getCode() == 302) { - String redirection = response.getHeader("Location"); - if (redirection == null) { - // some versions of MS IIS are known to mangle this header - redirection = response.getHeader("location"); + + while (isRedirect(code) && numRedirects < maxNumRedirects) { + numRedirects++; + + String redirectionLocation = response.getHeader("Location"); + if (StringUtils.isNotBlank(redirectionLocation)) { + LOG.debug("Following robots.txt redirect: {} -> {}", robotsUrlRedir, + redirectionLocation); + try { + robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation); + } catch (MalformedURLException e) { + LOG.info( + "Failed to resolve redirect location for robots.txt: {} -> {} ({})", + robotsUrlRedir, redirectionLocation, e.getMessage()); + break; + } + response = ((HttpBase) http).getResponse(robotsUrlRedir, + new CrawlDatum(), true); + code = response.getCode(); + if (robotsTxtContent != null) { + addRobotsContent(robotsTxtContent, robotsUrlRedir, response); + } + } else { + LOG.info( + "No HTTP redirect Location header for robots.txt: {} (status code: {})", + robotsUrlRedir, code); + break; } - if (redirection != null) { - if (!redirection.startsWith("http")) { - // RFC says it should be absolute, but apparently it isn't - redir = new URL(url, redirection); + + if ("/robots.txt".equals(robotsUrlRedir.getFile())) { + /* + * If a redirect points to a path /robots.txt on a different host + * (or a different authority scheme://host:port/, in general), we + * can lookup the cache for cached rules from the target host. + */ + String redirectCacheKey = getCacheKey(robotsUrlRedir); + robotRules = CACHE.get(redirectCacheKey); + LOG.debug( + "Found cached robots.txt rules for {} (redirected to {}) under target key {}", + url, robotsUrlRedir, redirectCacheKey); + if (robotRules != null) { + /* If found, cache and return the rules for the source host. */ + CACHE.put(cacheKey, robotRules); + return robotRules; } else { - redir = new URL(redirection); + /* + * Remember the target host/authority, we can cache the rules, + * too. 
+ */ + redirectCacheKeys.add(redirectCacheKey); } + } - response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), - true); - if (robotsTxtContent != null) { - addRobotsContent(robotsTxtContent, redir, response); - } + if (numRedirects == maxNumRedirects && isRedirect(code)) { + LOG.info( + "Reached maximum number of robots.txt redirects for {} (assuming no robots.txt, allow all)", + url); } } - if (response.getCode() == 200) // found rules: parse them + LOG.debug("Fetched robots.txt for {} with status code {}", url, code); + if (code == 200) // found rules: parse them robotRules = parseRules(url.toString(), response.getContent(), response.getHeader("Content-Type"), agentNames); - else if ((response.getCode() == 403) && (!allowForbidden)) + else if ((code == 403) && (!allowForbidden)) robotRules = FORBID_ALL_RULES; // use forbid all - else if (response.getCode() >= 500) { + + else if (code >= 500) { cacheRule = false; // try again later to fetch robots.txt if (deferVisits503) { // signal fetcher to suspend crawling for this host @@ -177,8 +241,15 @@ else if (response.getCode() >= 500) { robotRules = EMPTY_RULES; // use default rules } } catch (Throwable t) { - if (LOG.isInfoEnabled()) { - LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); + if (robotsUrl == null || robotsUrlRedir == null) { + LOG.info("Couldn't get robots.txt for {}", url, t); + } else if (robotsUrl.equals(robotsUrlRedir)) { + LOG.info("Couldn't get robots.txt for {} ({}): {}", url, robotsUrl, + t); + } else { + LOG.info( + "Couldn't get redirected robots.txt for {} (redirected to {}): {}", + url, robotsUrlRedir, t); } cacheRule = false; // try again later to fetch robots.txt robotRules = EMPTY_RULES; @@ -187,17 +258,27 @@ else if (response.getCode() >= 500) { if (cacheRule) { CACHE.put(cacheKey, robotRules); // cache rules for host - if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost()) - && "/robots.txt".equals(redir.getFile())) { - // cache also for the redirected host - // if the URL path is /robots.txt - CACHE.put(getCacheKey(redir), robotRules); + for (String redirectCacheKey : redirectCacheKeys) { + /* + * and also for redirect target hosts where URL path and query were + * found to be "/robots.txt" + */ + CACHE.put(redirectCacheKey, robotRules); } } return robotRules; } + /** + * @param code + * HTTP response status code + * @return whether the status code signals a redirect to a different location + */ + private boolean isRedirect(int code) { + return (code == 301 || code == 302 || code == 303 || code == 307 || code == 308); + } + /** * Append {@link Content} of robots.txt to {@literal robotsTxtContent} * From b081c75d87be61e42297c952298b72eb7ff2a6dc Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sun, 1 Oct 2023 14:08:39 +0200 Subject: [PATCH 25/28] NUTCH-3011 HttpRobotRulesParser: handle HTTP 429 Too Many Requests same as server errors (HTTP 5xx) --- conf/nutch-default.xml | 11 ++++++----- .../nutch/protocol/http/api/HttpRobotRulesParser.java | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 18ed56b037..d8bf76486c 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -141,8 +141,9 @@ http.robots.503.defer.visits true Temporarily suspend fetching from a host if the - robots.txt response is HTTP 503 or any other 5xx server error. See - also http.robots.503.defer.visits.delay and + robots.txt response is HTTP 503 or any other 5xx server error + and HTTP 429 Too Many Requests. 
See also + http.robots.503.defer.visits.delay and http.robots.503.defer.visits.retries
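Taken together, the three defer-visits properties behave roughly like the sketch below. The names robotsStatus, retryCount, suspendHostUntil() and dropHostQueue() are hypothetical stand-ins; the actual handling lives in the fetcher's host queues.

    boolean deferVisits = conf.getBoolean("http.robots.503.defer.visits", true);
    long deferDelay = conf.getLong("http.robots.503.defer.visits.delay", 300000L);
    int maxRetries = conf.getInt("http.robots.503.defer.visits.retries", 3);

    if (deferVisits && (robotsStatus >= 500 || robotsStatus == 429)) {
      if (++retryCount > maxRetries) {
        dropHostQueue();  // give up on this host for the segment/cycle
      } else {
        suspendHostUntil(System.currentTimeMillis() + deferDelay);
      }
    }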
    @@ -150,7 +151,7 @@ http.robots.503.defer.visits.delay 300000 Time in milliseconds to suspend crawling a host if the - robots.txt response is HTTP 5xx - see + robots.txt response is HTTP 5xx or 429 Too Many Requests - see http.robots.503.defer.visits. @@ -158,8 +159,8 @@ http.robots.503.defer.visits.retries 3 Number of retries crawling a host if the robots.txt - response is HTTP 5xx - see http.robots.503.defer.visits. After n - retries the host queue is dropped for this segment/cycle. + response is HTTP 5xx or 429 - see http.robots.503.defer.visits. + After n retries the host queue is dropped for this segment/cycle. diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index 8d7263e3ea..ec5e77e433 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -229,7 +229,8 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, else if ((code == 403) && (!allowForbidden)) robotRules = FORBID_ALL_RULES; // use forbid all - else if (code >= 500) { + else if (code >= 500 || code == 429) { + // 5xx server errors or 429 Too Many Requests cacheRule = false; // try again later to fetch robots.txt if (deferVisits503) { // signal fetcher to suspend crawling for this host From d2c3e96d88818d8107f320c49e007329b020e090 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 9 Oct 2023 10:21:01 +0200 Subject: [PATCH 26/28] NUTCH-3012 SegmentReader when dumping with option -recode: NPE on unparsed documents - fall back to UTF-8 when stringifying the content of unparsed documents --- src/java/org/apache/nutch/segment/SegmentReader.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java index 14546af543..ee5c266fd0 100644 --- a/src/java/org/apache/nutch/segment/SegmentReader.java +++ b/src/java/org/apache/nutch/segment/SegmentReader.java @@ -163,13 +163,16 @@ public void reduce(Text key, Iterable values, dump.append("\nRecno:: ").append(recNo++).append("\n"); dump.append("URL:: " + key.toString() + "\n"); Content content = null; - Charset charset = null; + // fall-back encoding for content of unparsed documents + Charset charset = StandardCharsets.UTF_8; for (NutchWritable val : values) { Writable value = val.get(); // unwrap if (value instanceof CrawlDatum) { dump.append("\nCrawlDatum::\n").append(((CrawlDatum) value).toString()); } else if (value instanceof Content) { if (recodeContent) { + // output recoded content later when charset is extracted from HTML + // metadata hold in ParseData content = (Content) value; } else { dump.append("\nContent::\n").append(((Content) value).toString()); From 8431dcfe52f5395a0fd9e3c00db009dbb2bcf6f5 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 21 Oct 2023 11:09:31 -0700 Subject: [PATCH 27/28] NUTCH-3013 Employ commons-lang3's StopWatch to simplify timing logic (#788) --- .github/workflows/master-build.yml | 1 - .gitignore | 1 + src/java/org/apache/nutch/crawl/CrawlDb.java | 19 ++++++++------- .../org/apache/nutch/crawl/CrawlDbMerger.java | 16 ++++++------- .../apache/nutch/crawl/DeduplicationJob.java | 16 ++++++------- .../org/apache/nutch/crawl/Generator.java | 17 ++++++------- src/java/org/apache/nutch/crawl/Injector.java | 16 
++++++------- src/java/org/apache/nutch/crawl/LinkDb.java | 15 ++++++------ .../org/apache/nutch/crawl/LinkDbMerger.java | 16 ++++++------- .../org/apache/nutch/crawl/LinkDbReader.java | 24 +++++++++---------- .../org/apache/nutch/fetcher/Fetcher.java | 17 ++++++------- .../org/apache/nutch/hostdb/ReadHostDb.java | 15 ++++++------ .../org/apache/nutch/hostdb/UpdateHostDb.java | 16 ++++++------- .../org/apache/nutch/indexer/CleaningJob.java | 16 ++++++------- .../org/apache/nutch/indexer/IndexingJob.java | 16 ++++++------- .../org/apache/nutch/parse/ParseSegment.java | 21 +++++++--------- .../nutch/scoring/webgraph/LinkDumper.java | 17 ++++++------- .../nutch/scoring/webgraph/LinkRank.java | 16 ++++++------- .../nutch/scoring/webgraph/NodeDumper.java | 16 ++++++------- .../nutch/scoring/webgraph/ScoreUpdater.java | 16 ++++++------- .../nutch/scoring/webgraph/WebGraph.java | 24 +++++++++---------- .../org/apache/nutch/tools/FreeGenerator.java | 16 ++++++------- .../nutch/tools/arc/ArcSegmentCreator.java | 16 ++++++------- .../apache/nutch/tools/warc/WARCExporter.java | 15 ++++++------ .../nutch/util/CrawlCompletionStats.java | 15 ++++++------ .../nutch/util/ProtocolStatusStatistics.java | 19 +++++++-------- .../apache/nutch/util/SitemapProcessor.java | 12 ++++++---- .../nutch/util/domain/DomainStatistics.java | 16 ++++++------- .../urlfilter/api/RegexURLFilterBaseTest.java | 11 +++++---- .../regex/TestRegexURLNormalizer.java | 8 +++++-- 30 files changed, 234 insertions(+), 225 deletions(-) diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index e3ed11c869..ba1d470ece 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -22,7 +22,6 @@ on: branches: [ master ] pull_request: branches: [ master ] - jobs: build: diff --git a/.gitignore b/.gitignore index 0612a99c23..b466908527 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ naivebayes-model csvindexwriter lib/spotbugs-* ivy/dependency-check-ant/* +.gradle* diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 3819bb3a01..16394832bf 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -19,14 +19,15 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -49,7 +50,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; /** * This class takes the output of the fetcher and updates the crawldb @@ -85,10 +85,11 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException, InterruptedException, ClassNotFoundException { - Path lock = lock(getConf(), crawlDb, force); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + + Path lock = lock(getConf(), crawlDb, 
force); Job job = CrawlDb.createJob(getConf(), crawlDb); Configuration conf = job.getConfiguration(); @@ -98,7 +99,7 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, boolean url404Purging = conf.getBoolean(CRAWLDB_PURGE_404, false); - LOG.info("CrawlDb update: starting at {}", sdf.format(start)); + LOG.info("CrawlDb update: starting"); LOG.info("CrawlDb update: db: {}", crawlDb); LOG.info("CrawlDb update: segments: {}", Arrays.asList(segments)); LOG.info("CrawlDb update: additions allowed: {}", additionsAllowed); @@ -151,9 +152,9 @@ public void update(Path crawlDb, Path[] segments, urlsFiltered); } - long end = System.currentTimeMillis(); - LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlDb update: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } /* diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java index 70c65135ec..1bf7243d38 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java @@ -18,11 +18,12 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Map.Entry; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,7 +45,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool merges several CrawlDb-s into one, optionally filtering URLs @@ -129,9 +129,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { Path lock = CrawlDb.lock(getConf(), output, false); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CrawlDb merge: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CrawlDb merge: starting"); Job job = createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { @@ -155,9 +155,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) NutchJob.cleanupAfterFailure(outPath, lock, fs); throw e; } - long end = System.currentTimeMillis(); - LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlDb merge: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static Job createMergeJob(Configuration conf, Path output, diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index ae5ac37ce0..217005d415 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -21,11 +21,12 @@ import java.lang.invoke.MethodHandles; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem;
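// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical class, not part of this patch) of the
// timing pattern NUTCH-3013 applies throughout: a commons-lang3 StopWatch
// replaces the SimpleDateFormat/System.currentTimeMillis() bookkeeping and
// TimingUtil.elapsedTime() formatting used before.
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;

public class StopWatchPatternSketch {
  public static void main(String[] args) throws InterruptedException {
    StopWatch stopWatch = new StopWatch();
    stopWatch.start();
    Thread.sleep(250); // stand-in for the MapReduce job being timed
    stopWatch.stop();
    // Prints the elapsed wall-clock time, here roughly 250 ms.
    System.out.println("job finished, elapsed: "
        + stopWatch.getTime(TimeUnit.MILLISECONDS) + " ms");
  }
}
// ---------------------------------------------------------------------------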
import org.apache.hadoop.fs.Path; @@ -48,7 +49,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -298,9 +298,9 @@ public int run(String[] args) throws IOException { } } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("DeduplicationJob: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("DeduplicationJob: starting"); Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); @@ -381,9 +381,9 @@ public int run(String[] args) throws IOException { // clean up fs.delete(tempDir, true); - long end = System.currentTimeMillis(); - LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Deduplication finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index d1569e1f03..1b62314e7a 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -30,7 +30,9 @@ import java.util.Locale; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configurable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -76,7 +78,6 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.SegmentReaderUtil; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -821,10 +822,10 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, Path lock = CrawlDb.lock(getConf(), dbDir, force); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Generator: starting at " + sdf.format(start)); - LOG.info("Generator: Selecting best-scoring urls due for fetch."); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Generator: starting"); + LOG.info("Generator: selecting best-scoring urls due for fetch."); LOG.info("Generator: filtering: " + filter); LOG.info("Generator: normalizing: " + norm); if (topN != Long.MAX_VALUE) { @@ -982,9 +983,9 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, } fs.delete(tempDir, true); - long end = System.currentTimeMillis(); - LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Generator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); Path[] patharray = new Path[generatedSegments.size()]; return generatedSegments.toArray(patharray); diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 9fca719f62..9bfd1b4547 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.crawl; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import 
org.apache.hadoop.fs.FileSystem; @@ -45,17 +46,16 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; /** * Injector takes a flat text file of URLs (or a folder containing text files) @@ -372,10 +372,11 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, boolean update, boolean normalize, boolean filter, boolean filterNormalizeAll) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Injector: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + + LOG.info("Injector: starting"); LOG.info("Injector: crawlDb: {}", crawlDb); LOG.info("Injector: urlDir: {}", urlDir); LOG.info("Injector: Converting injected urls to crawl db entries."); @@ -479,9 +480,8 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, urlsPurged404); } - long end = System.currentTimeMillis(); - LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end), - TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Injector: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); } } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) { LOG.error("Injector job failed: {}", e.getMessage()); diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java b/src/java/org/apache/nutch/crawl/LinkDb.java index 2b3d2ed907..3c752ab1db 100644 --- a/src/java/org/apache/nutch/crawl/LinkDb.java +++ b/src/java/org/apache/nutch/crawl/LinkDb.java @@ -21,13 +21,14 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -54,7 +55,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; /** Maintains an inverted link map, listing incoming links for each url. 
*/ public class LinkDb extends NutchTool implements Tool { @@ -196,9 +196,9 @@ public void invert(Path linkDb, Path[] segments, boolean normalize, Path currentLinkDb = new Path(linkDb, CURRENT_NAME); Configuration conf = job.getConfiguration(); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("LinkDb: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb: starting"); LOG.info("LinkDb: linkdb: {}", linkDb); LOG.info("LinkDb: URL normalize: {}", normalize); LOG.info("LinkDb: URL filter: {}", filter); @@ -260,8 +260,9 @@ public void invert(Path linkDb, Path[] segments, boolean normalize, } LinkDb.install(job, linkDb); - long end = System.currentTimeMillis(); - LOG.info("LinkDb: finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } private static Job createJob(Configuration config, Path linkDb, diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java b/src/java/org/apache/nutch/crawl/LinkDbMerger.java index f696c599e8..d6a41ab48c 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java +++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java @@ -18,11 +18,12 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Iterator; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -41,7 +42,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool merges several LinkDb-s into one, optionally filtering URLs through @@ -112,9 +112,9 @@ public void reduce(Text key, Iterable values, Context context) public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("LinkDb merge: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb merge: starting"); Job job = createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { @@ -137,9 +137,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME)); - long end = System.currentTimeMillis(); - LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb merge: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static Job createMergeJob(Configuration config, Path linkDb, diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java b/src/java/org/apache/nutch/crawl/LinkDbReader.java index c307b985d5..fa01f20bf3 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbReader.java +++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java @@ -16,13 +16,15 @@ */ package org.apache.nutch.crawl; +import java.io.Closeable; import java.io.IOException; - import java.lang.invoke.MethodHandles; +import
java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.Iterator; -// Commons Logging imports +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,11 +48,8 @@ import org.apache.nutch.util.AbstractChecker; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; -import java.text.SimpleDateFormat; -import java.util.Iterator; -import java.io.Closeable; + /** * Read utility for the LinkDb. @@ -153,10 +152,9 @@ public void map(Text key, Inlinks value, Context context) public void processDumpJob(String linkdb, String output, String regex) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - - LOG.info("LinkDb dump: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb dump: starting"); LOG.info("LinkDb dump: db: {}", linkdb); Path outFolder = new Path(output); @@ -192,9 +190,9 @@ public void processDumpJob(String linkdb, String output, String regex) throw e; } - long end = System.currentTimeMillis(); - LOG.info("LinkDb dump: finished at {}, elapsed: {}", - sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb dump: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } @Override diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 3727dcebef..92aef6f106 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -25,9 +25,11 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; @@ -454,11 +456,10 @@ public void fetch(Path segment, int threads) throws IOException, checkConfiguration(); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("Fetcher: starting at {}", TimingUtil.logDateMillis(start)); - LOG.info("Fetcher: segment: {}", segment); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Fetcher: starting"); + LOG.info("Fetcher: segment: {}", segment); // set the actual time for the timelimit relative // to the beginning of the whole job and not of a specific task @@ -530,9 +531,9 @@ public void fetch(Path segment, int threads) throws IOException, throw e; } - long end = System.currentTimeMillis(); - LOG.info("Fetcher: finished at {}, elapsed: {}", - TimingUtil.logDateMillis(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Fetcher: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } /** diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index ffddb18898..0321a8652c 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -18,9 +18,10 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Map; +import 
java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -42,7 +43,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.SegmentReaderUtil; import org.apache.commons.jexl3.JexlBuilder; @@ -168,9 +168,9 @@ public void map(Text key, HostDatum datum, Context context) throws IOException, // } private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean dumpHostnames, String expr) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ReadHostDb: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ReadHostDb: starting"); Configuration conf = getConf(); conf.setBoolean(HOSTDB_DUMP_HOMEPAGES, dumpHomepages); @@ -211,8 +211,9 @@ private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean throw e; } - long end = System.currentTimeMillis(); - LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ReadHostDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } private void getHostDbRecord(Path hostDb, String host) throws Exception { diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java index ffa68d0963..65e45c55d8 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java @@ -17,9 +17,10 @@ package org.apache.nutch.hostdb; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -40,7 +41,6 @@ import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -73,9 +73,9 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, boolean checkFailed, boolean checkNew, boolean checkKnown, boolean force, boolean filter, boolean normalize) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("UpdateHostDb: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("UpdateHostDb: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -149,9 +149,9 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, } LockUtil.removeLockFile(fs, lock); - long end = System.currentTimeMillis(); - LOG.info("UpdateHostDb: finished at " + sdf.format(end) + - ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("UpdateHostDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String args[]) throws Exception { diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java index 
dc3ed69e4a..04b9c2efa5 100644 --- a/src/java/org/apache/nutch/indexer/CleaningJob.java +++ b/src/java/org/apache/nutch/indexer/CleaningJob.java @@ -18,7 +18,9 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.ByteWritable; @@ -36,7 +38,6 @@ import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -139,9 +140,9 @@ public void reduce(ByteWritable key, Iterable values, public void delete(String crawldb, boolean noCommit) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CleaningJob: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CleaningJob: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -173,9 +174,8 @@ public void delete(String crawldb, boolean noCommit) throw e; } - long end = System.currentTimeMillis(); - LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CleaningJob: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); } @Override diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java index ff46bc0eff..d2115230c8 100644 --- a/src/java/org/apache/nutch/indexer/IndexingJob.java +++ b/src/java/org/apache/nutch/indexer/IndexingJob.java @@ -19,7 +19,6 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -27,7 +26,9 @@ import java.util.Locale; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.segment.SegmentChecker; import org.apache.hadoop.conf.Configuration; @@ -44,7 +45,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -104,9 +104,9 @@ public void index(Path crawlDb, Path linkDb, List segments, boolean filter, boolean normalize, boolean addBinaryContent, boolean base64) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Indexer: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Indexer: starting"); final Job job = NutchJob.getInstance(getConf()); job.setJobName("Indexer"); @@ -159,9 +159,9 @@ public void index(Path crawlDb, Path linkDb, List segments, String.format(Locale.ROOT, "%6d", counter.getValue()), counter.getName()); } - long end = System.currentTimeMillis(); - LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, 
end)); + stopWatch.stop(); + LOG.info("Indexer: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } finally { tmp.getFileSystem(conf).delete(tmp, true); } diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index c4e271feec..de45c463b9 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.parse; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.nutch.crawl.CrawlDatum; @@ -25,7 +26,6 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.TimingUtil; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -50,13 +50,12 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; -import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; /* Parse content in a segment. */ public class ParseSegment extends NutchTool implements Tool { @@ -228,12 +227,10 @@ public void parse(Path segment) throws IOException, return; } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("ParseSegment: starting at {}", sdf.format(start)); - LOG.info("ParseSegment: segment: {}", segment); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ParseSegment: starting"); + LOG.info("ParseSegment: segment: {}", segment); Job job = NutchJob.getInstance(getConf()); job.setJobName("parse " + segment); @@ -263,9 +260,9 @@ public void parse(Path segment) throws IOException, throw e; } - long end = System.currentTimeMillis(); - LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ParseSegment: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java index 54cd8b8ed1..4831d73f38 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java @@ -20,10 +20,11 @@ import java.io.DataOutput; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.List; import java.util.Random; +import java.util.concurrent.TimeUnit; + import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; @@ -31,6 +32,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -57,7 +59,6 @@ import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** 
* The LinkDumper tool creates a database of node to inlink information that can @@ -327,9 +328,9 @@ public void reduce(Text key, Iterable values, public void dumpLinks(Path webGraphDb) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("NodeDumper: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDumper: starting"); Configuration conf = getConf(); FileSystem fs = webGraphDb.getFileSystem(conf); @@ -400,9 +401,9 @@ public void dumpLinks(Path webGraphDb) throws IOException, } fs.delete(tempInverted, true); - long end = System.currentTimeMillis(); - LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDumper: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java index 739fe6cec1..c226ad130b 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java @@ -21,12 +21,12 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -35,6 +35,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -65,7 +66,6 @@ import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; public class LinkRank extends Configured implements Tool { @@ -651,9 +651,9 @@ public LinkRank(Configuration conf) { public void analyze(Path webGraphDb) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Analysis: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkRank Analysis: starting"); // store the link rank under the webgraphdb temporarily, final scores get // updated into the nodedb @@ -714,9 +714,9 @@ public void analyze(Path webGraphDb) throws IOException, // remove the temporary link rank folder fs.delete(linkRank, true); - long end = System.currentTimeMillis(); - LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkRank Analysis: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java index ede9fa1c59..dfccccc19e 100644 ---
a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java @@ -18,7 +18,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -27,6 +27,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -48,7 +49,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -293,9 +293,9 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("NodeDumper: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("NodeDumper: starting"); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); Job dumper = NutchJob.getInstance(getConf()); @@ -357,9 +357,9 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, LOG.error("NodeDumper job failed:", e); throw e; } - long end = System.currentTimeMillis(); - LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("NodeDumper: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java index 130e1b2a1c..c10a6e37b0 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java +++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java @@ -18,8 +18,8 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Random; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -28,6 +28,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -51,7 +52,6 @@ import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * Updates the score from the WebGraph node database into the crawl database. 
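// ---------------------------------------------------------------------------
// Sketch of the StopWatch lifecycle assumed by these hunks (hypothetical
// snippet, not Nutch code): every tool creates a fresh watch per run, so no
// reset is needed; a reused watch must be reset before it can be restarted.
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;

public class StopWatchLifecycleSketch {
  public static void main(String[] args) {
    StopWatch stopWatch = StopWatch.createStarted(); // new StopWatch() + start()
    long runningMs = stopWatch.getTime(TimeUnit.MILLISECONDS); // readable while running
    stopWatch.stop();
    long totalMs = stopWatch.getTime(TimeUnit.MILLISECONDS); // frozen once stopped
    stopWatch.reset(); // omitting this makes the next start() throw IllegalStateException
    stopWatch.start();
    stopWatch.stop();
    System.out.println(runningMs + " <= " + totalMs);
  }
}
// ---------------------------------------------------------------------------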
@@ -156,9 +156,9 @@ public void reduce(Text key, Iterable values, public void update(Path crawlDb, Path webGraphDb) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ScoreUpdater: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ScoreUpdater: starting"); Configuration conf = getConf(); @@ -213,9 +213,9 @@ public void update(Path crawlDb, Path webGraphDb) throws IOException, LOG.info("ScoreUpdater: installing new crawldb " + crawlDb); CrawlDb.install(updater, crawlDb); - long end = System.currentTimeMillis(); - LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ScoreUpdater: finished, elapsed: {} ms ", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java index 63d0ead7da..b98329d1e0 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java +++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashMap; @@ -26,6 +25,7 @@ import java.util.Map; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -34,6 +34,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -67,7 +68,6 @@ import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -518,14 +518,12 @@ public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("WebGraphDb: starting at " + sdf.format(start)); - LOG.info("WebGraphDb: webgraphdb: " + webGraphDb); - LOG.info("WebGraphDb: URL normalize: " + normalize); - LOG.info("WebGraphDb: URL filter: " + filter); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("WebGraphDb: starting"); + LOG.info("WebGraphDb: webgraphdb: " + webGraphDb); + LOG.info("WebGraphDb: URL normalize: " + normalize); + LOG.info("WebGraphDb: URL filter: " + filter); FileSystem fs = webGraphDb.getFileSystem(getConf()); @@ -715,9 +713,9 @@ public void createWebGraph(Path webGraphDb, Path[] segments, // remove the lock file for the webgraph LockUtil.removeLockFile(fs, lock); - long end = System.currentTimeMillis(); - LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("WebGraphDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void 
main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java index 039bccaece..e9f5c87619 100644 --- a/src/java/org/apache/nutch/tools/FreeGenerator.java +++ b/src/java/org/apache/nutch/tools/FreeGenerator.java @@ -18,10 +18,11 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.Path; @@ -47,7 +48,6 @@ import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool generates fetchlists (segments to be fetched) from plain text files @@ -180,9 +180,9 @@ public int run(String[] args) throws Exception { } } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("FreeGenerator: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("FreeGenerator: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -226,9 +226,9 @@ public int run(String[] args) throws Exception { LOG.error("FAILED: " + StringUtils.stringifyException(e)); return -1; } - long end = System.currentTimeMillis(); - LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("FreeGenerator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java index 4e916dbd50..825e752cc0 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java +++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java @@ -21,7 +21,9 @@ import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -56,7 +58,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.TimingUtil; /** *

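// ---------------------------------------------------------------------------
// Aside on the SLF4J calls in these hunks: the "{}" placeholder is filled
// only when the value is passed as a separate argument. Concatenating the
// value with "+" is a common pitfall that logs the literal "{} ms" followed
// by the number. Hypothetical logger setup below, not Nutch code.
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Slf4jPlaceholderSketch {
  private static final Logger LOG =
      LoggerFactory.getLogger(Slf4jPlaceholderSketch.class);

  public static void main(String[] args) {
    StopWatch stopWatch = StopWatch.createStarted();
    stopWatch.stop();
    // Correct: logs e.g. "finished, elapsed: 0 ms"
    LOG.info("finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
    // Pitfall: logs "finished, elapsed: {} ms0" because of string concatenation
    LOG.info("finished, elapsed: {} ms" + stopWatch.getTime(TimeUnit.MILLISECONDS));
  }
}
// ---------------------------------------------------------------------------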
    @@ -368,10 +369,10 @@ public void map(Text key, BytesWritable bytes, public void createSegments(Path arcFiles, Path segmentsOutDir) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); if (LOG.isInfoEnabled()) { - LOG.info("ArcSegmentCreator: starting at " + sdf.format(start)); + LOG.info("ArcSegmentCreator: starting"); LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles); } @@ -402,10 +403,9 @@ public void createSegments(Path arcFiles, Path segmentsOutDir) throw e; } - - long end = System.currentTimeMillis(); - LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) - + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ArcSegmentCreator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String args[]) throws Exception { diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index cf000ba526..6d8a385572 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -29,8 +29,10 @@ import java.util.List; import java.util.Locale; import java.util.UUID; +import java.util.concurrent.TimeUnit; import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; @@ -58,7 +60,6 @@ import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -428,9 +429,9 @@ protected JsonObject metadataToJson(Metadata meta) { public int generateWARC(String output, List segments, boolean onlySuccessfulResponses, boolean includeParseData, boolean includeParseText) throws IOException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("WARCExporter: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("WARCExporter: starting"); final Job job = NutchJob.getInstance(getConf()); job.setJobName("warc-exporter " + output); @@ -479,9 +480,9 @@ public int generateWARC(String output, List segments, throw new RuntimeException(message); } LOG.info(job.getCounters().toString()); - long end = System.currentTimeMillis(); - LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end), - TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("WARCExporter: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("WARCExporter job failed: {}", e.getMessage()); return -1; diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java index 7210ee83af..8696d28221 100644 --- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java +++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java @@ -20,7 +20,7 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit;
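// ---------------------------------------------------------------------------
// Note on the WARCExporter imports above: commons-lang (org.apache.commons.lang)
// and commons-lang3 (org.apache.commons.lang3) are separate artifacts with
// separate packages, so both can coexist on the classpath while the migration
// proceeds class by class. Hypothetical snippet mixing the two:
import org.apache.commons.lang.StringUtils; // legacy commons-lang 2.x
import org.apache.commons.lang3.time.StopWatch; // commons-lang3

public class MixedCommonsLangSketch {
  public static void main(String[] args) {
    StopWatch stopWatch = StopWatch.createStarted();
    String name = StringUtils.defaultIfEmpty("", "unnamed"); // lang 2.x API
    stopWatch.stop();
    System.out.println(name + " took " + stopWatch.getTime() + " ms");
  }
}
// ---------------------------------------------------------------------------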
import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -30,6 +30,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -127,9 +128,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[3]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CrawlCompletionStats: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CrawlCompletionStats: starting"); int mode = 0; String jobName = "CrawlCompletionStats"; @@ -180,9 +181,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("CrawlCompletionStats: finished at {}, elapsed: {}", - sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlCompletionStats: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java index 2499da0bfb..0fe6c57d03 100644 --- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java +++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java @@ -16,10 +16,11 @@ */ package org.apache.nutch.util; -import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -37,8 +38,6 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.metadata.Nutch; /** @@ -86,9 +85,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[2]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ProtocolStatistics: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ProtocolStatistics: starting"); String jobName = "ProtocolStatistics"; @@ -130,9 +129,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ProtocolStatistics: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index 98f7df839d..66fa9b0e7a 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -22,7 +22,9 @@ import java.util.Collection; import java.util.List; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import 
org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; @@ -359,8 +361,9 @@ else if(sitemapDatum != null) { public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter, boolean normalize, int threads) throws Exception { - long start = System.currentTimeMillis(); - LOG.info("SitemapProcessor: Starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("SitemapProcessor: starting"); FileSystem fs = crawldb.getFileSystem(getConf()); Path old = new Path(crawldb, "old"); @@ -441,8 +444,9 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches); LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries); - long end = System.currentTimeMillis(); - LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("SitemapProcessor_" + crawldb.toString(), e); diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java index 638b6c94f1..f77b72bc5f 100644 --- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java @@ -20,8 +20,9 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -39,7 +40,6 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,9 +92,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[3]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("DomainStatistics: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("DomainStatistics: starting"); int mode = 0; String jobName = "DomainStatistics"; @@ -151,9 +151,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("DomainStatistics: finished, elapsed: {} ms ", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java index c77c67eb17..080b2e5870 100644 --- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java +++ b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java @@ -16,7 +16,6 @@ */ package org.apache.nutch.urlfilter.api; -// JDK imports 
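// ---------------------------------------------------------------------------
// Why StopWatch beats raw currentTimeMillis() deltas for the benches below:
// commons-lang3 StopWatch measures intervals with the monotonic
// System.nanoTime() clock, so a wall-clock adjustment mid-run cannot skew or
// negate the reported duration. Hypothetical comparison, not Nutch code.
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;

public class MonotonicTimingSketch {
  public static void main(String[] args) throws InterruptedException {
    long wallStart = System.currentTimeMillis(); // subject to clock adjustments
    StopWatch stopWatch = StopWatch.createStarted(); // nanoTime-based interval
    Thread.sleep(50); // stand-in for the benchmarked loop
    stopWatch.stop();
    System.out.println("wall-clock delta: "
        + (System.currentTimeMillis() - wallStart) + " ms");
    System.out.println("stopwatch: "
        + stopWatch.getTime(TimeUnit.MILLISECONDS) + " ms");
  }
}
// ---------------------------------------------------------------------------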
import java.lang.invoke.MethodHandles; import java.io.BufferedReader; import java.io.FileReader; @@ -24,12 +23,13 @@ import java.io.Reader; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.junit.Assert; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -// Nutch imports import org.apache.nutch.net.URLFilter; /** @@ -58,7 +58,8 @@ protected void bench(int loops, String file) { } protected void bench(int loops, Reader rules, Reader urls) { - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); try { URLFilter filter = getURLFilter(rules); FilteredURL[] expected = readURLFile(urls); @@ -68,8 +69,8 @@ protected void bench(int loops, Reader rules, Reader urls) { } catch (Exception e) { Assert.fail(e.toString()); } - LOG.info("bench time (" + loops + ") " - + (System.currentTimeMillis() - start) + "ms"); + stopWatch.stop(); + LOG.info("bench time {} loops {} ms", loops, stopWatch.getTime(TimeUnit.MILLISECONDS)); } protected void bench(int loops, String rulesFile, String urlsFile) { diff --git a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java index 1eee7183b7..4952a1da4c 100644 --- a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java +++ b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java @@ -25,11 +25,13 @@ import java.io.IOException; import java.io.InputStreamReader; import java.util.*; +import java.util.concurrent.TimeUnit; import org.junit.Assert; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.util.NutchConfiguration; @@ -104,7 +106,8 @@ private void normalizeTest(NormalizedURL[] urls, String scope) } private void bench(int loops, String scope) { - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); try { NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope); if (expected == null) @@ -115,8 +118,9 @@ private void bench(int loops, String scope) { } catch (Exception e) { Assert.fail(e.toString()); } + stopWatch.stop(); LOG.info("bench time (" + loops + ") " - + (System.currentTimeMillis() - start) + "ms"); + + (stopWatch.getTime(TimeUnit.MILLISECONDS)) + "ms"); } private static class NormalizedURL { From 792ed28914f4beb2fb8b8ce28eebe17196c92af1 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Fri, 27 Oct 2023 15:04:22 -0700 Subject: [PATCH 28/28] NUTCH-3015 Add more CI steps to GitHub master-build.yml (#790) --- .github/workflows/dependency-check.yml | 37 +++++++++++ .github/workflows/master-build.yml | 64 +++++++++++++++---- .gitignore | 1 + build.xml | 52 ++++++++++++--- .../dependency-check-suppressions.xml | 5 -- src/java/overview.html | 16 +++++ .../creativecommons/conf/crawl-urlfilter.txt | 15 +++++ .../creativecommons/conf/nutch-site.xml | 16 +++++ src/plugin/creativecommons/data/anchor.html | 16 +++++ src/plugin/creativecommons/data/rdf.html | 16 +++++ src/plugin/creativecommons/data/rel.html | 16 +++++ src/plugin/creativecommons/ivy.xml | 1 - src/plugin/exchange-jexl/README.md | 17 
+++++ src/plugin/exchange-jexl/ivy.xml | 1 - src/plugin/feed/ivy.xml | 1 - src/plugin/headings/ivy.xml | 1 - src/plugin/index-anchor/ivy.xml | 1 - src/plugin/index-basic/ivy.xml | 1 - src/plugin/index-geoip/ivy.xml | 1 - src/plugin/index-geoip/plugin.xml | 1 + src/plugin/index-jexl-filter/ivy.xml | 1 - src/plugin/index-links/README.md | 17 +++++ src/plugin/index-links/ivy.xml | 1 - src/plugin/index-metadata/ivy.xml | 1 - src/plugin/index-more/ivy.xml | 1 - src/plugin/index-replace/ivy.xml | 1 - .../sample/testIndexReplace.html | 16 +++++ src/plugin/index-static/ivy.xml | 1 - src/plugin/indexer-cloudsearch/README.md | 17 +++++ .../indexer-cloudsearch/createCSDomain.sh | 15 +++++ src/plugin/indexer-csv/README.md | 17 +++++ src/plugin/indexer-csv/ivy.xml | 1 - src/plugin/indexer-dummy/README.md | 17 +++++ src/plugin/indexer-dummy/ivy.xml | 1 - src/plugin/indexer-elastic/README.md | 17 +++++ ...wto_upgrade_es.txt => howto_upgrade_es.md} | 17 +++++ src/plugin/indexer-kafka/ivy.xml | 1 - src/plugin/indexer-opensearch-1x/README.md | 17 +++++ ...search.txt => howto_upgrade_opensearch.md} | 17 +++++ src/plugin/indexer-rabbit/README.md | 17 +++++ src/plugin/indexer-rabbit/ivy.xml | 1 - src/plugin/indexer-solr/README.md | 17 +++++ ...upgrade_solr.txt => howto_upgrade_solr.md} | 17 +++++ src/plugin/indexer-solr/ivy.xml | 25 +++++--- src/plugin/indexer-solr/plugin.xml | 26 +++++--- src/plugin/language-identifier/ivy.xml | 1 - src/plugin/lib-htmlunit/ivy.xml | 1 - src/plugin/lib-http/ivy.xml | 1 - src/plugin/lib-nekohtml/ivy.xml | 1 - src/plugin/lib-rabbitmq/ivy.xml | 1 - src/plugin/lib-regex-filter/ivy.xml | 1 - src/plugin/lib-selenium/README.md | 17 +++++ .../lib-selenium/howto_upgrade_selenium.md | 32 ++++++++++ .../lib-selenium/howto_upgrade_selenium.txt | 15 ----- src/plugin/lib-selenium/ivy.xml | 1 - src/plugin/lib-xml/ivy.xml | 1 - src/plugin/microformats-reltag/ivy.xml | 1 - src/plugin/mimetype-filter/ivy.xml | 1 - src/plugin/nutch-extensionpoints/ivy.xml | 1 - src/plugin/parse-ext/command | 15 +++++ src/plugin/parse-ext/ivy.xml | 1 - src/plugin/parse-html/ivy.xml | 1 - src/plugin/parse-js/ivy.xml | 1 - .../sample/parse_embedded_js_test.html | 16 +++++ .../parse-js/sample/parse_pure_js_test.js | 15 +++++ src/plugin/parse-metatags/ivy.xml | 1 - .../parse-metatags/sample/testMetatags.html | 16 +++++ .../sample/testMultivalueMetatags.html | 16 +++++ ...upgrade_tika.txt => howto_upgrade_tika.md} | 17 +++++ src/plugin/parse-tika/ivy.xml | 1 - src/plugin/parse-tika/sample/nutch.html | 16 +++++ src/plugin/parse-zip/ivy.xml | 1 - src/plugin/parsefilter-debug/ivy.xml | 1 - src/plugin/parsefilter-naivebayes/ivy.xml | 1 - .../data/regex-parsefilter.txt | 15 +++++ src/plugin/parsefilter-regex/ivy.xml | 1 - src/plugin/protocol-file/ivy.xml | 1 - .../protocol-file/sample/testprotocolfile.txt | 15 +++++ .../sample/testprotocolfile_(encoded).txt | 15 +++++ src/plugin/protocol-foo/ivy.xml | 1 - src/plugin/protocol-foo/plugin.xml | 1 - src/plugin/protocol-ftp/ivy.xml | 1 - src/plugin/protocol-htmlunit/ivy.xml | 1 - src/plugin/protocol-http/ivy.xml | 1 - src/plugin/protocol-httpclient/ivy.xml | 1 - .../protocol-interactiveselenium/README.md | 17 +++++ .../protocol-interactiveselenium/ivy.xml | 1 - ...ade_okhttp.txt => howto_upgrade_okhttp.md} | 17 +++++ src/plugin/protocol-okhttp/ivy.xml | 1 - src/plugin/protocol-selenium/README.md | 17 +++++ src/plugin/protocol-selenium/ivy.xml | 1 - src/plugin/publish-rabbitmq/ivy.xml | 1 - src/plugin/scoring-depth/ivy.xml | 1 - src/plugin/scoring-link/ivy.xml | 1 - 
src/plugin/scoring-metadata/ivy.xml | 1 - src/plugin/scoring-opic/ivy.xml | 1 - src/plugin/scoring-orphan/ivy.xml | 1 - src/plugin/scoring-similarity/ivy.xml | 1 - src/plugin/subcollection/ivy.xml | 1 - src/plugin/tld/ivy.xml | 1 - src/plugin/urlfilter-automaton/ivy.xml | 1 - src/plugin/urlfilter-domain/data/hosts.txt | 15 +++++ src/plugin/urlfilter-domain/ivy.xml | 1 - .../urlfilter-domaindenylist/data/hosts.txt | 15 +++++ src/plugin/urlfilter-domaindenylist/ivy.xml | 1 - src/plugin/urlfilter-fast/README.md | 16 +++++ src/plugin/urlfilter-fast/ivy.xml | 1 - src/plugin/urlfilter-ignoreexempt/README.md | 17 +++++ src/plugin/urlfilter-ignoreexempt/ivy.xml | 1 - src/plugin/urlfilter-prefix/ivy.xml | 1 - src/plugin/urlfilter-regex/ivy.xml | 1 - src/plugin/urlfilter-suffix/ivy.xml | 1 - src/plugin/urlfilter-validator/ivy.xml | 1 - src/plugin/urlmeta/ivy.xml | 1 - src/plugin/urlnormalizer-ajax/ivy.xml | 1 - src/plugin/urlnormalizer-basic/ivy.xml | 1 - src/plugin/urlnormalizer-host/data/hosts.txt | 15 +++++ src/plugin/urlnormalizer-host/ivy.xml | 1 - src/plugin/urlnormalizer-pass/ivy.xml | 1 - .../urlnormalizer-protocol/data/protocols.txt | 15 +++++ src/plugin/urlnormalizer-protocol/ivy.xml | 1 - src/plugin/urlnormalizer-querystring/ivy.xml | 1 - src/plugin/urlnormalizer-regex/ivy.xml | 1 - .../sample/regex-normalize-default.test | 15 +++++ .../sample/regex-normalize-scope1.test | 15 +++++ .../urlnormalizer-slash/data/slashes.txt | 15 +++++ src/plugin/urlnormalizer-slash/ivy.xml | 1 - src/test/crawl-tests.xml | 16 +++++ src/test/filter-all.txt | 15 +++++ src/test/log4j.properties | 15 +++++ src/test/nutch-site.xml | 16 +++++ .../fetch-test-site/dup_of_pagea.html | 16 +++++ .../fetch-test-site/exception.html | 16 +++++ src/testresources/fetch-test-site/index.html | 16 +++++ .../fetch-test-site/nested_spider_trap.html | 16 +++++ src/testresources/fetch-test-site/pagea.html | 16 +++++ src/testresources/fetch-test-site/pageb.html | 16 +++++ src/testresources/fetch-test-site/robots.txt | 14 ++++ 138 files changed, 1060 insertions(+), 136 deletions(-) create mode 100644 .github/workflows/dependency-check.yml rename src/plugin/indexer-elastic/{howto_upgrade_es.txt => howto_upgrade_es.md} (61%) rename src/plugin/indexer-opensearch-1x/{howto_upgrade_opensearch.txt => howto_upgrade_opensearch.md} (62%) rename src/plugin/indexer-solr/{howto_upgrade_solr.txt => howto_upgrade_solr.md} (60%) create mode 100644 src/plugin/lib-selenium/howto_upgrade_selenium.md delete mode 100644 src/plugin/lib-selenium/howto_upgrade_selenium.txt rename src/plugin/parse-tika/{howto_upgrade_tika.txt => howto_upgrade_tika.md} (73%) rename src/plugin/protocol-okhttp/{howto_upgrade_okhttp.txt => howto_upgrade_okhttp.md} (52%) diff --git a/.github/workflows/dependency-check.yml b/.github/workflows/dependency-check.yml new file mode 100644 index 0000000000..f07f746a0d --- /dev/null +++ b/.github/workflows/dependency-check.yml @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: dependency check + +on: + schedule: + - cron: '0 0 * * *' # every day at midnight + +jobs: + dependency-check: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Dependency check + run: ant clean dependency-check -buildfile build.xml diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index ba1d470ece..e0af58df06 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -1,4 +1,3 @@ -# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -13,28 +12,67 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -name: master pr build +name: master pull request ci on: push: - branches: [ master ] + branches: [master] pull_request: - branches: [ master ] + types: [opened, synchronize, reopened] + branches: [master] jobs: - build: - runs-on: ubuntu-latest + javadoc: strategy: matrix: - java: [ '11' ] - + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Javadoc + run: ant clean javadoc -buildfile build.xml + rat: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Run Apache Rat + run: ant clean run-rat -buildfile build.xml + - name: Cache unknown licenses + run: echo "UNKNOWN_LICENSES=$(sed -n 18p /home/runner/work/nutch/nutch/build/apache-rat-report.txt)" >> $GITHUB_ENV + - name: Versions + run: | + echo $UNKNOWN_LICENSES + - name: Fail if any unknown licenses + if: ${{ env.UNKNOWN_LICENSES != '0 Unknown Licenses' }} + run: exit 1 + test: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest, macos-latest] + runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} - name: Build with Ant - run: ant clean nightly javadoc -buildfile build.xml + distribution: 'temurin' + - name: Test + run: ant clean test -buildfile build.xml diff --git a/.gitignore b/.gitignore index b466908527..12365dd0d4 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ csvindexwriter lib/spotbugs-* ivy/dependency-check-ant/* .gradle* +ivy/apache-rat-* diff --git a/build.xml b/build.xml index b44581405a..dd9797302b 100644 --- a/build.xml +++
b/build.xml @@ -38,7 +38,7 @@ - + @@ -48,7 +48,7 @@ - + @@ -640,13 +640,15 @@ - + + reportformat="ALL" + assemblyAnalyzerEnabled="false" + failBuildOnCVSS="1"> @@ -1025,7 +1027,7 @@ - - + @@ -1047,8 +1049,40 @@ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ivy/dependency-check-ant/dependency-check-suppressions.xml b/ivy/dependency-check-ant/dependency-check-suppressions.xml index e7de8febb2..a7f4ca16df 100644 --- a/ivy/dependency-check-ant/dependency-check-suppressions.xml +++ b/ivy/dependency-check-ant/dependency-check-suppressions.xml @@ -1,8 +1,3 @@ - - only applies to tika-server < 1.18 - ^org\.(apache\.tika:tika-(core|parsers)|gagravarr:vorbis-java-tika):.*$ - CVE-2018-1335 - diff --git a/src/java/overview.html b/src/java/overview.html index 11321417ba..3de53a7d28 100644 --- a/src/java/overview.html +++ b/src/java/overview.html @@ -1,3 +1,19 @@ + Apache Nutch diff --git a/src/plugin/creativecommons/conf/crawl-urlfilter.txt b/src/plugin/creativecommons/conf/crawl-urlfilter.txt index 324617f07a..eb6786e4b4 100644 --- a/src/plugin/creativecommons/conf/crawl-urlfilter.txt +++ b/src/plugin/creativecommons/conf/crawl-urlfilter.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # Creative Commnons crawl filter # Each non-comment, non-blank line contains a regular expression diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml index e28e12a9a8..4b343b2cc9 100644 --- a/src/plugin/creativecommons/conf/nutch-site.xml +++ b/src/plugin/creativecommons/conf/nutch-site.xml @@ -1,5 +1,21 @@ + diff --git a/src/plugin/creativecommons/data/anchor.html b/src/plugin/creativecommons/data/anchor.html index 90b522759d..3267bc9ea8 100755 --- a/src/plugin/creativecommons/data/anchor.html +++ b/src/plugin/creativecommons/data/anchor.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/data/rdf.html b/src/plugin/creativecommons/data/rdf.html index fb2c34dfe5..60c27cc541 100755 --- a/src/plugin/creativecommons/data/rdf.html +++ b/src/plugin/creativecommons/data/rdf.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/data/rel.html b/src/plugin/creativecommons/data/rel.html index 413d52f869..3d11572d82 100755 --- a/src/plugin/creativecommons/data/rel.html +++ b/src/plugin/creativecommons/data/rel.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/ivy.xml b/src/plugin/creativecommons/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/creativecommons/ivy.xml +++ b/src/plugin/creativecommons/ivy.xml @@ -1,5 +1,4 @@ - + exchange-jexl plugin for Nutch ============================== diff --git a/src/plugin/exchange-jexl/ivy.xml b/src/plugin/exchange-jexl/ivy.xml index 1275664e5d..cb5a0f1862 100644 --- a/src/plugin/exchange-jexl/ivy.xml +++ b/src/plugin/exchange-jexl/ivy.xml @@ -1,5 +1,4 @@ - + indexer-links plugin for Nutch ============================== diff --git a/src/plugin/index-links/ivy.xml b/src/plugin/index-links/ivy.xml index 624dcaf4a2..3d4fc905c3 100644 --- a/src/plugin/index-links/ivy.xml +++ b/src/plugin/index-links/ivy.xml @@ -1,5 +1,4 @@ - Testing the power of the index-replace plugin diff --git a/src/plugin/index-static/ivy.xml b/src/plugin/index-static/ivy.xml index 1275664e5d..cb5a0f1862 100644 --- a/src/plugin/index-static/ivy.xml +++ b/src/plugin/index-static/ivy.xml @@ -1,5 +1,4 @@ - + AWS CloudSearch plugin for Nutch ================================ diff --git a/src/plugin/indexer-cloudsearch/createCSDomain.sh b/src/plugin/indexer-cloudsearch/createCSDomain.sh index 24fb0156c6..1cb8481fe0 100644 --- a/src/plugin/indexer-cloudsearch/createCSDomain.sh +++ b/src/plugin/indexer-cloudsearch/createCSDomain.sh @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # example of domain configuration for CloudSearch DOMAIN="$1" diff --git a/src/plugin/indexer-csv/README.md b/src/plugin/indexer-csv/README.md index 80220974a7..4d1288b198 100644 --- a/src/plugin/indexer-csv/README.md +++ b/src/plugin/indexer-csv/README.md @@ -1,3 +1,20 @@ + + indexer-csv plugin for Nutch ============================ diff --git a/src/plugin/indexer-csv/ivy.xml b/src/plugin/indexer-csv/ivy.xml index 75b5d54e55..e7bf875468 100644 --- a/src/plugin/indexer-csv/ivy.xml +++ b/src/plugin/indexer-csv/ivy.xml @@ -1,5 +1,4 @@ - + indexer-dummy plugin for Nutch ============================== diff --git a/src/plugin/indexer-dummy/ivy.xml b/src/plugin/indexer-dummy/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/indexer-dummy/ivy.xml +++ b/src/plugin/indexer-dummy/ivy.xml @@ -1,5 +1,4 @@ - + indexer-elastic plugin for Nutch ================================ diff --git a/src/plugin/indexer-elastic/howto_upgrade_es.txt b/src/plugin/indexer-elastic/howto_upgrade_es.md similarity index 61% rename from src/plugin/indexer-elastic/howto_upgrade_es.txt rename to src/plugin/indexer-elastic/howto_upgrade_es.md index a8156444c6..b57e0c02fa 100644 --- a/src/plugin/indexer-elastic/howto_upgrade_es.txt +++ b/src/plugin/indexer-elastic/howto_upgrade_es.md @@ -1,3 +1,20 @@ + + 1. Upgrade Elasticsearch dependency in src/plugin/indexer-elastic/ivy.xml 2. Upgrade the Elasticsearch specific dependencies in src/plugin/indexer-elastic/plugin.xml diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml index 7bdd94324a..9d605c50b5 100644 --- a/src/plugin/indexer-kafka/ivy.xml +++ b/src/plugin/indexer-kafka/ivy.xml @@ -1,5 +1,4 @@ - + indexer-opensearch1x plugin for Nutch ================================ diff --git a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md similarity index 62% rename from src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt rename to src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md index 0725900445..c9b723ffcf 100644 --- a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt +++ b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md @@ -1,3 +1,20 @@ + + 1. Upgrade OpenSearch dependency in src/plugin/indexer-opensearch-1x/ivy.xml 2. Upgrade the OpenSearch specific dependencies in src/plugin/indexer-opensearch-1x/plugin.xml diff --git a/src/plugin/indexer-rabbit/README.md b/src/plugin/indexer-rabbit/README.md index 6ea09a9151..8040cd6c76 100644 --- a/src/plugin/indexer-rabbit/README.md +++ b/src/plugin/indexer-rabbit/README.md @@ -1,3 +1,20 @@ + + indexer-rabbit plugin for Nutch =============================== diff --git a/src/plugin/indexer-rabbit/ivy.xml b/src/plugin/indexer-rabbit/ivy.xml index dd450cf7f0..d2daf91dad 100644 --- a/src/plugin/indexer-rabbit/ivy.xml +++ b/src/plugin/indexer-rabbit/ivy.xml @@ -1,5 +1,4 @@ - + indexer-solr plugin for Nutch ============================= diff --git a/src/plugin/indexer-solr/howto_upgrade_solr.txt b/src/plugin/indexer-solr/howto_upgrade_solr.md similarity index 60% rename from src/plugin/indexer-solr/howto_upgrade_solr.txt rename to src/plugin/indexer-solr/howto_upgrade_solr.md index b2a7eb5c89..905fb84a9e 100644 --- a/src/plugin/indexer-solr/howto_upgrade_solr.txt +++ b/src/plugin/indexer-solr/howto_upgrade_solr.md @@ -1,3 +1,20 @@ + + 1. Upgrade Solr dependency in src/plugin/indexer-solr/ivy.xml 2. 
Upgrade the Solr specific dependencies in src/plugin/indexer-solr/plugin.xml diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml index ce59942daf..ab5fd72c7a 100644 --- a/src/plugin/indexer-solr/ivy.xml +++ b/src/plugin/indexer-solr/ivy.xml @@ -1,15 +1,20 @@ + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml index f672ac9ed0..21cc7d8bdf 100644 --- a/src/plugin/indexer-solr/plugin.xml +++ b/src/plugin/indexer-solr/plugin.xml @@ -1,14 +1,20 @@ - + diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml index 68e9ed76e1..f64b97055b 100644 --- a/src/plugin/language-identifier/ivy.xml +++ b/src/plugin/language-identifier/ivy.xml @@ -1,5 +1,4 @@ - + # Updates * The use of phantomjs has been deprecated. Check [Wikipedia](https://en.wikipedia.org/wiki/PhantomJS) for more info. * The updated code for Safari webriver is under development as starting Safari 10 on OS X El Capitan and macOS Sierra, Safari comes bundled with a new driver implementation. diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.md b/src/plugin/lib-selenium/howto_upgrade_selenium.md new file mode 100644 index 0000000000..3071c74cbf --- /dev/null +++ b/src/plugin/lib-selenium/howto_upgrade_selenium.md @@ -0,0 +1,32 @@ + + +1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml + +2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml + + To get a list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib | sed 's/^/ \n \n <\/library>/g' + + Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). + + N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows + + $ brew install gnu-sed --with-default-names + + You can then restart your terminal and the Regex + Sed command should work just fine! diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.txt b/src/plugin/lib-selenium/howto_upgrade_selenium.txt deleted file mode 100644 index 1892a6275e..0000000000 --- a/src/plugin/lib-selenium/howto_upgrade_selenium.txt +++ /dev/null @@ -1,15 +0,0 @@ -1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml - -2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml - - To get a list of dependencies and their versions execute: - $ ant -f ./build-ivy.xml - $ ls lib | sed 's/^/ \n \n <\/library>/g' - - Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). - - N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows - - $ brew install gnu-sed --with-default-names - - You can then restart your terminal and the Regex + Sed command should work just fine!
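
As a worked example of the upgrade steps above, here is a minimal sketch of the lib-selenium regeneration round-trip. It assumes GNU sed; the exact stanza shape (one library element with a wildcard export per jar in lib/) is inferred from the howto's note that all dependent libraries are exported for a "library" plugin, not taken verbatim from the patch:

    $ cd src/plugin/lib-selenium
    # resolve the upgraded ivy.xml dependencies into lib/
    $ ant -f ./build-ivy.xml
    # print one <library name="..."><export name="*"/></library> stanza per
    # fetched jar, ready to paste into the runtime section of plugin.xml
    # (GNU sed assumed, for \n in the replacement text)
    $ ls lib | sed 's/^/<library name="/; s/$/">\n  <export name="*"\/>\n<\/library>/'
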
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml index 7d3a2d6242..0d460cdb4d 100644 --- a/src/plugin/lib-selenium/ivy.xml +++ b/src/plugin/lib-selenium/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/plugin/parse-js/sample/parse_pure_js_test.js b/src/plugin/parse-js/sample/parse_pure_js_test.js index f196313f85..0e486a8793 100644 --- a/src/plugin/parse-js/sample/parse_pure_js_test.js +++ b/src/plugin/parse-js/sample/parse_pure_js_test.js @@ -1,3 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // test data for link extraction from "pure" JavaScript function selectProvider(form) { diff --git a/src/plugin/parse-metatags/ivy.xml b/src/plugin/parse-metatags/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/parse-metatags/ivy.xml +++ b/src/plugin/parse-metatags/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html index ca8b737c2b..36d2c8814a 100644 --- a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html +++ b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.md similarity index 73% rename from src/plugin/parse-tika/howto_upgrade_tika.txt rename to src/plugin/parse-tika/howto_upgrade_tika.md index 46d075948b..8ed6c3f3cd 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.md @@ -1,3 +1,20 @@ + + We are currently using a shim (https://github.com/tballison/hadoop-safe-tika because of binary conflicts in commons-io versions between what Hadoop supports and the more modern features that Apache Tika and Apache POI were using in commons-io. diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index 1586d9661f..b89e812e18 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/plugin/parse-zip/ivy.xml b/src/plugin/parse-zip/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/parse-zip/ivy.xml +++ b/src/plugin/parse-zip/ivy.xml @@ -1,5 +1,4 @@ - + Nutch Interactive Selenium ========================== diff --git a/src/plugin/protocol-interactiveselenium/ivy.xml b/src/plugin/protocol-interactiveselenium/ivy.xml index 506be0aecb..112483bcdc 100644 --- a/src/plugin/protocol-interactiveselenium/ivy.xml +++ b/src/plugin/protocol-interactiveselenium/ivy.xml @@ -1,5 +1,4 @@ - + 1. Upgrade OkHttp dependency in src/plugin/protocol-okhttp/ivy.xml 2. 
Upgrade OkHttp's own dependencies in src/plugin/protocol-okhttp/plugin.xml diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml index ead8232474..73b4fa6369 100644 --- a/src/plugin/protocol-okhttp/ivy.xml +++ b/src/plugin/protocol-okhttp/ivy.xml @@ -1,5 +1,4 @@ - + Nutch Selenium ============== diff --git a/src/plugin/protocol-selenium/ivy.xml b/src/plugin/protocol-selenium/ivy.xml index 506be0aecb..112483bcdc 100644 --- a/src/plugin/protocol-selenium/ivy.xml +++ b/src/plugin/protocol-selenium/ivy.xml @@ -1,5 +1,4 @@ - Filters URLs based on a file of regular expressions using host/domains matching first. The default policy is to accept a URL if no matches diff --git a/src/plugin/urlfilter-fast/ivy.xml b/src/plugin/urlfilter-fast/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/urlfilter-fast/ivy.xml +++ b/src/plugin/urlfilter-fast/ivy.xml @@ -1,5 +1,4 @@ - + urlfilter-ignoreexempt ====================== This plugin allows certain urls to be exempted when the external links are configured to be ignored. diff --git a/src/plugin/urlfilter-ignoreexempt/ivy.xml b/src/plugin/urlfilter-ignoreexempt/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/urlfilter-ignoreexempt/ivy.xml +++ b/src/plugin/urlfilter-ignoreexempt/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/test/filter-all.txt b/src/test/filter-all.txt index 4ed567ab1c..d738aec76a 100644 --- a/src/test/filter-all.txt +++ b/src/test/filter-all.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Config file for urlfilter-suffix plugin # Filter away all urls diff --git a/src/test/log4j.properties b/src/test/log4j.properties index 3ff115f46f..08e272c712 100644 --- a/src/test/log4j.properties +++ b/src/test/log4j.properties @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # log4j configuration used during build and unit tests log4j.rootLogger=info,stdout diff --git a/src/test/nutch-site.xml b/src/test/nutch-site.xml index dd408739dc..0d6177e5e6 100644 --- a/src/test/nutch-site.xml +++ b/src/test/nutch-site.xml @@ -1,4 +1,20 @@ + diff --git a/src/testresources/fetch-test-site/dup_of_pagea.html b/src/testresources/fetch-test-site/dup_of_pagea.html index 6444c41225..63c4e61537 100644 --- a/src/testresources/fetch-test-site/dup_of_pagea.html +++ b/src/testresources/fetch-test-site/dup_of_pagea.html @@ -1,3 +1,19 @@ + page a diff --git a/src/testresources/fetch-test-site/exception.html b/src/testresources/fetch-test-site/exception.html index e1192a176b..66f134ee25 100644 --- a/src/testresources/fetch-test-site/exception.html +++ b/src/testresources/fetch-test-site/exception.html @@ -1,3 +1,19 @@ + diff --git a/src/testresources/fetch-test-site/index.html b/src/testresources/fetch-test-site/index.html index d73ff3f691..3fc6e61e5a 100644 --- a/src/testresources/fetch-test-site/index.html +++ b/src/testresources/fetch-test-site/index.html @@ -1,3 +1,19 @@ + front page diff --git a/src/testresources/fetch-test-site/nested_spider_trap.html b/src/testresources/fetch-test-site/nested_spider_trap.html index 5dcf7c2209..dd32ee2362 100644 --- a/src/testresources/fetch-test-site/nested_spider_trap.html +++ b/src/testresources/fetch-test-site/nested_spider_trap.html @@ -1,3 +1,19 @@ + nested spider trap diff --git a/src/testresources/fetch-test-site/pagea.html b/src/testresources/fetch-test-site/pagea.html index 6444c41225..63c4e61537 100644 --- a/src/testresources/fetch-test-site/pagea.html +++ b/src/testresources/fetch-test-site/pagea.html @@ -1,3 +1,19 @@ + page a diff --git a/src/testresources/fetch-test-site/pageb.html b/src/testresources/fetch-test-site/pageb.html index 66e3725ef0..cf77ff4f75 100644 --- a/src/testresources/fetch-test-site/pageb.html +++ b/src/testresources/fetch-test-site/pageb.html @@ -1,3 +1,19 @@ + bage b diff --git a/src/testresources/fetch-test-site/robots.txt b/src/testresources/fetch-test-site/robots.txt index e69de29bb2..fc590f9733 100644 --- a/src/testresources/fetch-test-site/robots.txt +++ b/src/testresources/fetch-test-site/robots.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file
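
The CI jobs added in this final patch call plain Ant targets, so the same checks can be reproduced locally before pushing. A minimal sketch, assuming JDK 11 and Ant on the PATH, using the invocations taken verbatim from the workflows above:

    $ ant clean test -buildfile build.xml                # unit tests (test job, Linux and macOS)
    $ ant clean javadoc -buildfile build.xml             # javadoc job
    $ ant clean run-rat -buildfile build.xml             # Apache Rat license audit (rat job)
    $ ant clean dependency-check -buildfile build.xml    # nightly OWASP dependency check

The rat job then expects line 18 of build/apache-rat-report.txt to read "0 Unknown Licenses"; anything else fails the build, which is why this patch adds Apache license headers to the configuration files and test fixtures above.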