From 5a223e172298eb83e8b0f6880364b4746ba760e5 Mon Sep 17 00:00:00 2001
From: tballison
Date: Mon, 28 Aug 2023 10:05:44 -0400
Subject: [PATCH 01/28] NUTCH-2989 -- ElasticIndexWriter should enable auth
 for https, too

---
 .../apache/nutch/indexwriter/elastic/ElasticIndexWriter.java | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index 053bfd68aa..290d9dfca2 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -210,6 +210,9 @@ public HttpAsyncClientBuilder customizeHttpClient(
     restClientBuilder.setHttpClientConfigCallback(new HttpClientConfigCallback() {
       @Override
       public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) {
+        if (auth) {
+          httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider);
+        }
         // ignore issues with self-signed certificates
         httpClientBuilder.setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE);
         return httpClientBuilder.setSSLContext(sslContext);
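For context, the added branch attaches the same credentials provider that the plain-http callback already uses; previously it was only wired up for http, so credentials configured for an https endpoint were silently ignored. A minimal sketch of how such a provider is typically built with the Apache HttpClient 4.x API that the Elasticsearch low-level RestClient sits on (the method and variable names here are illustrative assumptions, not the exact Nutch code):

    import org.apache.http.auth.AuthScope;
    import org.apache.http.auth.UsernamePasswordCredentials;
    import org.apache.http.client.CredentialsProvider;
    import org.apache.http.impl.client.BasicCredentialsProvider;

    public class EsAuthSketch {
      // hypothetical helper mirroring the wiring around the hunk above
      static CredentialsProvider build(String user, String password) {
        BasicCredentialsProvider provider = new BasicCredentialsProvider();
        provider.setCredentials(AuthScope.ANY,
            new UsernamePasswordCredentials(user, password));
        return provider;
      }
    }

With the fix, the provider returned here is applied in both the http and https client-config callbacks.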
From 3bb8b0eeb90f7ba1304ef807cf87f28d0a6341f5 Mon Sep 17 00:00:00 2001
From: tballison
Date: Wed, 30 Aug 2023 12:20:53 -0400
Subject: [PATCH 02/28] NUTCH-2999 -- upgrade Lucene to latest 8.x throughout

---
 src/plugin/indexer-elastic/plugin.xml        | 28 +++++++++----------
 src/plugin/indexer-opensearch-1x/plugin.xml  | 28 +++++++++----------
 src/plugin/parsefilter-naivebayes/ivy.xml    | 11 +-------
 src/plugin/parsefilter-naivebayes/plugin.xml | 15 ----------
 src/plugin/scoring-similarity/ivy.xml        |  2 +-
 src/plugin/scoring-similarity/plugin.xml     |  4 +--
 .../similarity/util/LuceneAnalyzerUtil.java  |  4 +--
 .../similarity/util/LuceneTokenizer.java     |  6 ++--
 8 files changed, 37 insertions(+), 61 deletions(-)

diff --git a/src/plugin/indexer-elastic/plugin.xml b/src/plugin/indexer-elastic/plugin.xml
index 387a3ac664..679979d32a 100644
--- a/src/plugin/indexer-elastic/plugin.xml
+++ b/src/plugin/indexer-elastic/plugin.xml
@@ -48,20 +48,20 @@
 [14 <library> version bumps; the XML did not survive extraction]

diff --git a/src/plugin/indexer-opensearch-1x/plugin.xml b/src/plugin/indexer-opensearch-1x/plugin.xml
index 1bf5affc2f..e1dde463dd 100644
--- a/src/plugin/indexer-opensearch-1x/plugin.xml
+++ b/src/plugin/indexer-opensearch-1x/plugin.xml
@@ -48,20 +48,20 @@
 [14 <library> version bumps; XML lost in extraction]

diff --git a/src/plugin/parsefilter-naivebayes/ivy.xml b/src/plugin/parsefilter-naivebayes/ivy.xml
index ca96ec57bc..c261adac62 100644
--- a/src/plugin/parsefilter-naivebayes/ivy.xml
+++ b/src/plugin/parsefilter-naivebayes/ivy.xml
@@ -35,15 +35,6 @@
 [<dependency> declarations consolidated/removed; XML lost in extraction]

diff --git a/src/plugin/parsefilter-naivebayes/plugin.xml b/src/plugin/parsefilter-naivebayes/plugin.xml
index 76f30de6b5..c4983e1c9b 100644
--- a/src/plugin/parsefilter-naivebayes/plugin.xml
+++ b/src/plugin/parsefilter-naivebayes/plugin.xml
@@ -25,21 +25,6 @@
 [<library> declarations removed; XML lost in extraction]

diff --git a/src/plugin/scoring-similarity/ivy.xml b/src/plugin/scoring-similarity/ivy.xml
index b889f8056c..1acd1d442d 100644
--- a/src/plugin/scoring-similarity/ivy.xml
+++ b/src/plugin/scoring-similarity/ivy.xml
@@ -36,7 +36,7 @@
 [Lucene <dependency> version bump; XML lost in extraction]

diff --git a/src/plugin/scoring-similarity/plugin.xml b/src/plugin/scoring-similarity/plugin.xml
index a0353c7189..4ed0592661 100644
--- a/src/plugin/scoring-similarity/plugin.xml
+++ b/src/plugin/scoring-similarity/plugin.xml
@@ -26,8 +26,8 @@
 [two Lucene <library> entries updated; XML lost in extraction]

diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
index eae5ba5e45..0c1e5fc62d 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
@@ -23,10 +23,10 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
 import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.CharArraySet;
 
 /**
@@ -54,7 +54,7 @@ public static enum StemFilterType { PORTERSTEM_FILTER, ENGLISHMINIMALSTEM_FILTER
   public LuceneAnalyzerUtil(StemFilterType stemFilterType, boolean useStopFilter) {
     LuceneAnalyzerUtil.stemFilterType = stemFilterType;
     if(useStopFilter) {
-      stopSet = StandardAnalyzer.STOP_WORDS_SET;
+      stopSet = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
     }
     else {
       stopSet = null;

diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index d09af8244d..8567a39b2c 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -23,10 +23,10 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
 import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.CharArraySet;
@@ -56,7 +56,7 @@ public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopF
     this.tokenizer = tokenizer;
     this.stemFilterType = stemFilterType;
     if(useStopFilter) {
-      stopSet = StandardAnalyzer.STOP_WORDS_SET;
+      stopSet = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
     }
     tokenStream = createTokenStream(content);
   }
@@ -78,7 +78,7 @@ public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> sto
     this.tokenizer = tokenizer;
     this.stemFilterType = stemFilterType;
     if(addToDefault) {
-      CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);;
+      CharArraySet stopSet = CharArraySet.copy(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
       for(String word : stopWords){
         stopSet.add(word);
       }
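The Java side of this upgrade is mechanical: the English stop-word list that used to be reached through StandardAnalyzer is taken from EnglishAnalyzer instead. A minimal, self-contained check of the replacement (the added "nutch" term is purely illustrative):

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.en.EnglishAnalyzer;

    public class StopWordsMigration {
      public static void main(String[] args) {
        // EnglishAnalyzer.ENGLISH_STOP_WORDS_SET replaces the old
        // StandardAnalyzer.STOP_WORDS_SET; copy() yields a mutable set
        CharArraySet stopSet = CharArraySet.copy(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        stopSet.add("nutch"); // custom additions, as in LuceneTokenizer
        System.out.println(stopSet.contains("the")); // -> true
      }
    }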
From cf74770f9642356d4c2cdc9d6f41aaf8a8928bcf Mon Sep 17 00:00:00 2001
From: tballison
Date: Wed, 30 Aug 2023 14:05:24 -0400
Subject: [PATCH 03/28] NUTCH-2978, upgrade to slf4j2 throughout, first steps

---
 ivy/ivy.xml                     |  19 +-
 src/plugin/any23/ivy.xml        |   4 +-
 src/plugin/any23/plugin.xml     | 325 ++++++++++++++++----------------
 src/plugin/build-plugin.xml     |   4 +-
 src/plugin/lib-rabbitmq/ivy.xml |   4 +-
 5 files changed, 181 insertions(+), 175 deletions(-)

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 18a6df2302..6366b891ce 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -36,9 +36,9 @@
 [slf4j <dependency> entries bumped to 2.x; XML lost in extraction]
@@ -51,10 +51,15 @@
 [logging <dependency> entries replaced/added; XML lost in extraction]

diff --git a/src/plugin/any23/ivy.xml b/src/plugin/any23/ivy.xml
index 3b755ee3fa..99f6d2da85 100644
--- a/src/plugin/any23/ivy.xml
+++ b/src/plugin/any23/ivy.xml
@@ -39,8 +39,8 @@
 [two <dependency> entries updated; XML lost in extraction]

diff --git a/src/plugin/any23/plugin.xml b/src/plugin/any23/plugin.xml
index dae8c47aa3..25e1b42144 100644
--- a/src/plugin/any23/plugin.xml
+++ b/src/plugin/any23/plugin.xml
@@ -26,170 +26,167 @@
 [the plugin's <library> list regenerated, ~160 entries; XML lost in extraction]

diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml
index f6e87e8057..3f0d9ca44a 100755
--- a/src/plugin/build-plugin.xml
+++ b/src/plugin/build-plugin.xml
@@ -265,5 +265,7 @@
 [classpath entries adjusted; XML lost in extraction]

diff --git a/src/plugin/lib-rabbitmq/ivy.xml b/src/plugin/lib-rabbitmq/ivy.xml
index 28665978b0..1b6ceac371 100644
--- a/src/plugin/lib-rabbitmq/ivy.xml
+++ b/src/plugin/lib-rabbitmq/ivy.xml
@@ -36,7 +36,9 @@
 [<dependency> declarations updated; XML lost in extraction]
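The ivy changes above (lost to extraction) move the logging stack to SLF4J 2.x. Calling code should not need to change: the basic Logger/LoggerFactory API is source-compatible, and what differs in 2.x is backend discovery (ServiceLoader providers instead of the 1.x static binder). A minimal smoke test under that assumption:

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class Slf4jSmokeTest {
      private static final Logger LOG = LoggerFactory.getLogger(Slf4jSmokeTest.class);

      public static void main(String[] args) {
        // identical source against slf4j-api 1.7.x and 2.x; only the
        // provider jar on the classpath decides where this ends up
        LOG.info("slf4j {} binding check", 2);
      }
    }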
From 8d9c77fd1b044f7c8fc51b70e34321cb9260cfbb Mon Sep 17 00:00:00 2001
From: tballison
Date: Wed, 30 Aug 2023 14:34:48 -0400
Subject: [PATCH 04/28] NUTCH-2999 -- upgrade lucene to latest 8.x throughout

---
 src/plugin/indexer-elastic/ivy.xml          | 15 ++++
 src/plugin/indexer-elastic/plugin.xml       | 89 ++++++++++---------
 .../howto_upgrade_opensearch.txt            | 33 +++++++
 src/plugin/indexer-opensearch-1x/ivy.xml    | 15 ++++
 src/plugin/indexer-opensearch-1x/plugin.xml | 88 +++++++++---------
 5 files changed, 152 insertions(+), 88 deletions(-)
 create mode 100644 src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt

diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml
index abdcceae29..e5cdfdf656 100644
--- a/src/plugin/indexer-elastic/ivy.xml
+++ b/src/plugin/indexer-elastic/ivy.xml
@@ -40,7 +40,22 @@
 [15 dependency/exclusion lines added; XML lost in extraction]

diff --git a/src/plugin/indexer-elastic/plugin.xml b/src/plugin/indexer-elastic/plugin.xml
index 679979d32a..fc3723a608 100644
--- a/src/plugin/indexer-elastic/plugin.xml
+++ b/src/plugin/indexer-elastic/plugin.xml
@@ -22,50 +22,51 @@
 [<library> list regenerated; XML lost in extraction]

diff --git a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt
new file mode 100644
index 0000000000..0725900445
--- /dev/null
+++ b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt
@@ -0,0 +1,33 @@
+1. Upgrade OpenSearch dependency in src/plugin/indexer-opensearch-1x/ivy.xml
+
+2. Upgrade the OpenSearch specific dependencies in src/plugin/indexer-opensearch-1x/plugin.xml
+   To get the list of dependencies and their versions execute:
+    $ cd src/plugin/indexer-opensearch-1x/
+    $ ant -f ./build-ivy.xml
+    $ ls lib | sed 's/^/    <library name="/g' | sed 's/$/"\/>/g'
+
+   In the plugin.xml replace all lines between
+      <!-- OpenSearch Rest Client dependencies -->
+   and
+      <!-- end of OpenSearch Rest Client dependencies -->
+   with the output of the command above.
+
+3. (Optionally) remove overlapping dependencies between indexer-opensearch-1x and Nutch core dependencies:
+   - check for libs present both in
+       build/lib
+     and
+       build/plugins/indexer-opensearch-1x/
+     (eventually with different versions)
+   - duplicated libs can be added to the exclusions of transitive dependencies in
+     build/plugins/indexer-opensearch-1x/ivy.xml
+   - but it should be made sure that the library versions in ivy/ivy.xml correspond to
+     those required by the OpenSearch client
+
+4. Remove the locally "installed" dependencies in src/plugin/indexer-opensearch-1x/lib/:
+
+   $ rm -rf lib/
+
+5. Build Nutch and run all unit tests:
+
+   $ cd ../../../
+   $ ant clean runtime test
\ No newline at end of file

diff --git a/src/plugin/indexer-opensearch-1x/ivy.xml b/src/plugin/indexer-opensearch-1x/ivy.xml
index 1505ad3c82..ae5d91e41e 100644
--- a/src/plugin/indexer-opensearch-1x/ivy.xml
+++ b/src/plugin/indexer-opensearch-1x/ivy.xml
@@ -40,7 +40,22 @@
 [15 dependency/exclusion lines added; XML lost in extraction]

diff --git a/src/plugin/indexer-opensearch-1x/plugin.xml b/src/plugin/indexer-opensearch-1x/plugin.xml
index e1dde463dd..ee0d45dc2a 100644
--- a/src/plugin/indexer-opensearch-1x/plugin.xml
+++ b/src/plugin/indexer-opensearch-1x/plugin.xml
@@ -22,50 +22,50 @@
 [<library> list regenerated; XML lost in extraction]
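Run from the plugin directory, the sed pipeline in step 2 simply wraps each jar in lib/ in a <library> element ready to paste into plugin.xml. Hypothetical output (the jar names below are invented for illustration):

    $ ls lib | sed 's/^/    <library name="/g' | sed 's/$/"\/>/g'
        <library name="opensearch-rest-high-level-client-1.3.11.jar"/>
        <library name="httpasyncclient-4.1.5.jar"/>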
From daedbc36ceeba506795973b75ead2f5b4b59ddd9 Mon Sep 17 00:00:00 2001
From: tballison
Date: Thu, 31 Aug 2023 09:07:29 -0400
Subject: [PATCH 05/28] NUTCH-2978 -- exclude reload4j and update
 LICENSE-binary and NOTICE-binary.

---
 LICENSE-binary |  2 --
 NOTICE-binary  |  8 --------
 ivy/ivy.xml    | 17 +++++++++++++----
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/LICENSE-binary b/LICENSE-binary
index 8e24a728e2..a0e05cbd1e 100644
--- a/LICENSE-binary
+++ b/LICENSE-binary
@@ -208,7 +208,6 @@ This product bundles some components that are also licensed under the Apache
 License Version 2.0:
 
-ch.qos.reload4j:reload4j
 com.101tec:zkclient
 com.amazonaws:aws-java-sdk-cloudsearch
 com.amazonaws:aws-java-sdk-core
@@ -758,7 +757,6 @@ org.jsoup:jsoup
 org.rypt:f8
 org.slf4j:jcl-over-slf4j
 org.slf4j:slf4j-api
-org.slf4j:slf4j-reload4j
 
 Mozilla Public License 1.1 (MPL 1.1)

diff --git a/NOTICE-binary b/NOTICE-binary
index 1aab2cb411..97e5b7d12f 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -163,10 +163,6 @@ AOP alliance (http://aopalliance.sourceforge.net)
 - license: Public Domain
   (licenses-binary/LICENSE-public-domain.txt)
 
-# ch.qos.reload4j:reload4j
-reload4j (https://reload4j.qos.ch)
-- license: The Apache Software License, Version 2.0
-
 # com.101tec:zkclient
 ZkClient (https://github.com/sgroschupf/zkclient)
 - license: The Apache Software License, Version 2.0
@@ -1100,10 +1096,6 @@ JCL 1.2 implemented over SLF4J (http://www.slf4j.org)
   (licenses-binary/LICENSE-mit-license.txt)
 # org.slf4j:slf4j-api
 SLF4J API Module (http://www.slf4j.org)
-- license: MIT License
-  (licenses-binary/LICENSE-mit-license.txt)
-# org.slf4j:slf4j-reload4j
-SLF4J Reload4j Binding (http://reload4j.qos.ch)
 - license: MIT License
   (licenses-binary/LICENSE-mit-license.txt)

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 6366b891ce..9e19cec33d 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -50,15 +50,21 @@
 [reload4j/slf4j-reload4j exclusions added to the logging dependencies; XML lost in extraction]
@@ -111,7 +117,10 @@
 [exclusions added; XML lost in extraction]
From 820d129a8adff9a34eed2ed3c04cfee377b56b63 Mon Sep 17 00:00:00 2001
From: tallison
Date: Wed, 13 Sep 2023 10:26:25 -0400
Subject: [PATCH 06/28] NUTCH-3000 - the selenium protocol should return the
 full html, not just the inner body element.

---
 .../apache/nutch/protocol/selenium/HttpWebClient.java | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
index 4b998d1bc8..b0b12004da 100644
--- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -234,7 +234,7 @@ public static void cleanUpDriver(WebDriver driver) {
   }
 
   /**
-   * Function for obtaining the HTML BODY using the selected
+   [replacement javadoc line lost in extraction]
    * selenium webdriver There are a number of configuration properties within
    * nutch-site.xml which determine whether to take screenshots of
@@ -244,7 +244,7 @@
    *          the URL to fetch and render
    * @param conf
    *          the {@link org.apache.hadoop.conf.Configuration}
-   * @return the rendered inner HTML page
+   * @return the html page
    */
   public static String getHtmlPage(String url, Configuration conf) {
     WebDriver driver = getDriverForPage(url, conf);
@@ -253,10 +253,7 @@ public static String getHtmlPage(String url, Configuration conf) {
     if (conf.getBoolean("take.screenshot", false)) {
       takeScreenshot(driver, conf);
     }
-
-    String innerHtml = driver.findElement(By.tagName("body"))
-        .getAttribute("innerHTML");
-    return innerHtml;
+    return driver.getPageSource();
 
     // I'm sure this catch statement is a code smell ; borrowing it from
     // lib-htmlunit
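The behavioral difference is easiest to see in isolation. A minimal sketch, assuming any configured Selenium WebDriver implementation:

    import org.openqa.selenium.WebDriver;

    public class PageSourceDemo {
      // getPageSource() serializes the whole document, including <head> and
      // the <html> element itself; body.getAttribute("innerHTML") returned
      // only the markup nested inside <body>
      static String render(WebDriver driver, String url) {
        driver.get(url);
        return driver.getPageSource();
      }
    }

Anything carried in the head (titles, meta tags, embedded JSON-LD) is therefore preserved for downstream parse filters.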
From b6f645a4d025fa136f557dd37e9aba611b425fbb Mon Sep 17 00:00:00 2001
From: tallison
Date: Wed, 13 Sep 2023 10:37:17 -0400
Subject: [PATCH 07/28] NUTCH-3001 - fix logic for grabbing bytes if there's
 no content type in the header

---
 .../nutch/protocol/selenium/HttpResponse.java | 78 +++++++++----------
 1 file changed, 37 insertions(+), 41 deletions(-)

diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
index bb3bf6357c..7506773748 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -269,55 +269,51 @@ public HttpResponse(Http http, URL url, CrawlDatum datum)
       String contentType = getHeader(Response.CONTENT_TYPE);
 
       // handle with Selenium only if content type in HTML or XHTML
-      if (contentType != null) {
-        if (contentType.contains("text/html")
-            || contentType.contains("application/xhtml")) {
-          readPlainContent(url);
-        } else {
-          try {
-            int contentLength = Integer.MAX_VALUE;
-            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
-            if (contentLengthString != null) {
-              try {
-                contentLength = Integer.parseInt(contentLengthString.trim());
-              } catch (NumberFormatException ex) {
-                throw new HttpException(
-                    "bad content length: " + contentLengthString);
-              }
-            }
-
-            if (http.getMaxContent() >= 0
-                && contentLength > http.getMaxContent()) {
-              contentLength = http.getMaxContent();
-            }
-
-            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
-            int bufferFilled = 0;
-            int totalRead = 0;
-            ByteArrayOutputStream out = new ByteArrayOutputStream();
-            while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
-                && totalRead + bufferFilled <= contentLength) {
-              totalRead += bufferFilled;
-              out.write(buffer, 0, bufferFilled);
-            }
-
-            content = out.toByteArray();
-
-          } catch (Exception e) {
-            if (code == 200)
-              throw new IOException(e.toString());
-            // for codes other than 200 OK, we are fine with empty content
-          } finally {
-            if (in != null) {
-              in.close();
-            }
-          }
-        }
-        if (httpHeaders != null) {
-          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
-        }
-      }
-
+      if (contentType != null &&
+          (contentType.contains("text/html") || contentType.contains("application/xhtml"))) {
+        readPlainContent(url);
+      } else {
+        try {
+          int contentLength = Integer.MAX_VALUE;
+          String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+          if (contentLengthString != null) {
+            try {
+              contentLength = Integer.parseInt(contentLengthString.trim());
+            } catch (NumberFormatException ex) {
+              throw new HttpException("bad content length: " + contentLengthString);
+            }
+          }
+
+          if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+            contentLength = http.getMaxContent();
+          }
+
+          byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+          int bufferFilled = 0;
+          int totalRead = 0;
+          ByteArrayOutputStream out = new ByteArrayOutputStream();
+          while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 &&
+              totalRead + bufferFilled <= contentLength) {
+            totalRead += bufferFilled;
+            out.write(buffer, 0, bufferFilled);
+          }
+
+          content = out.toByteArray();
+
+        } catch (Exception e) {
+          if (code == 200) {
+            throw new IOException(e.toString());
+          }
+          // for codes other than 200 OK, we are fine with empty content
+        } finally {
+          if (in != null) {
+            in.close();
+          }
+        }
+      }
+      if (httpHeaders != null) {
+        headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+      }
     } catch (KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
       throw new ProtocolException(e);
     } finally {
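The fix keeps the bounded-read loop but reaches it whenever the Content-Type is missing or non-HTML, instead of skipping the fetch entirely when the header is absent. The loop itself is worth seeing on its own; a self-contained rendering (the buffer size is arbitrary here):

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;

    public class BoundedRead {
      // copy at most `limit` bytes from `in`; mirrors the loop in the hunk above,
      // tolerating absent or malformed Content-Length values upstream
      static byte[] readAtMost(InputStream in, int limit) throws IOException {
        byte[] buffer = new byte[4096];
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        int read;
        int total = 0;
        while ((read = in.read(buffer, 0, buffer.length)) != -1
            && total + read <= limit) {
          total += read;
          out.write(buffer, 0, read);
        }
        return out.toByteArray();
      }
    }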
From 8a5ef498c5d930c367ca72547d07d4aaa2d55254 Mon Sep 17 00:00:00 2001
From: tallison
Date: Wed, 13 Sep 2023 11:03:09 -0400
Subject: [PATCH 08/28] Remove Any23 from Nutch

---
 LICENSE-binary                                |    5 -
 NOTICE-binary                                 |    5 +-
 NOTICE.txt                                    |    2 +-
 build.xml                                     |    5 -
 conf/nutch-default.xml                        |   16 -
 default.properties                            |    4 +-
 src/plugin/any23/build-ivy.xml                |   47 -
 src/plugin/any23/build.xml                    |   36 -
 src/plugin/any23/howto_upgrade_any23.txt      |   22 -
 src/plugin/any23/ivy.xml                      |   49 -
 src/plugin/any23/plugin.xml                   |  216 -
 .../any23/sample/BBC_News_Scotland.html       | 3780 -----------------
 src/plugin/any23/sample/microdata_basic.html  |  107 -
 .../nutch/any23/Any23IndexingFilter.java      |  117 -
 .../apache/nutch/any23/Any23ParseFilter.java  |  171 -
 .../org/apache/nutch/any23/package-info.java  |   24 -
 .../nutch/any23/TestAny23IndexingFilter.java  |   81 -
 .../nutch/any23/TestAny23ParseFilter.java     |  119 -
 src/plugin/build.xml                          |    3 -
 src/plugin/parse-tika/howto_upgrade_tika.txt  |    2 -
 20 files changed, 3 insertions(+), 4808 deletions(-)
 delete mode 100644 src/plugin/any23/build-ivy.xml
 delete mode 100644 src/plugin/any23/build.xml
 delete mode 100644 src/plugin/any23/howto_upgrade_any23.txt
 delete mode 100644 src/plugin/any23/ivy.xml
 delete mode 100644 src/plugin/any23/plugin.xml
 delete mode 100644 src/plugin/any23/sample/BBC_News_Scotland.html
 delete mode 100644 src/plugin/any23/sample/microdata_basic.html
 delete mode 100644 src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
 delete mode 100644 src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
 delete mode 100644 src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java
 delete mode 100644 src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java
 delete mode 100644 src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java

diff --git a/LICENSE-binary b/LICENSE-binary
index 8e24a728e2..5e11ec79c9 100644
--- a/LICENSE-binary
+++ b/LICENSE-binary
@@ -327,11 +327,6 @@ net.sourceforge.owlapi:owlapi-impl
 net.sourceforge.owlapi:owlapi-parsers
 net.sourceforge.owlapi:owlapi-rio
 net.sourceforge.owlapi:owlapi-tools
-org.apache.any23:apache-any23-api
-org.apache.any23:apache-any23-core
-org.apache.any23:apache-any23-csvutils
-org.apache.any23:apache-any23-encoding
-org.apache.any23:apache-any23-mime
 org.apache.avro:avro
 org.apache.commons:commons-collections4
 org.apache.commons:commons-compress

diff --git a/NOTICE-binary b/NOTICE-binary
index 1aab2cb411..61a5c1d0d2 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -29,7 +29,7 @@ code and source code.
 The following provides more details on the included cryptographic
 software:
 
-The plugins parse-tika and any23 use Apache Tika and the Bouncy Castle
+The parse-tika plugin uses Apache Tika and the Bouncy Castle
 generic encryption libraries for extracting text content and metadata
 from encrypted PDF files. See <http://www.bouncycastle.org/> for more
 details on Bouncy Castle and <http://tika.apache.org> for details
@@ -46,9 +46,6 @@ on Apache Tika.
 Apache projects
 ---------------
 
-Apache Any23 (https://any23.apache.org/)
-  see https://github.com/apache/any23/blob/master/NOTICE.txt
-
 Apache Avro (https://avro.apache.org)
   see https://github.com/apache/avro/blob/master/NOTICE.txt

diff --git a/NOTICE.txt b/NOTICE.txt
index 939ddc8031..4fdd968ab0 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -29,7 +29,7 @@ code and source code.
 The following provides more details on the included cryptographic
 software:
 
-The plugins parse-tika and any23 use Apache Tika and the Bouncy Castle
+The parse-tika plugin uses Apache Tika and the Bouncy Castle
 generic encryption libraries for extracting text content and metadata
 from encrypted PDF files. See <http://www.bouncycastle.org/> for more
 details on Bouncy Castle and <http://tika.apache.org> for details

diff --git a/build.xml b/build.xml
index 9326a8ba21..b44581405a 100644
--- a/build.xml
+++ b/build.xml
@@ -202,7 +202,6 @@
 [any23 build entry removed; XML lost in extraction]
@@ -687,7 +686,6 @@
 [any23 build entry removed; XML lost in extraction]
@@ -772,7 +770,6 @@
 [any23 build entry removed; XML lost in extraction]
@@ -1180,8 +1177,6 @@
 [two any23 build entries removed; XML lost in extraction]

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index e98bd55708..58455b338c 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1353,22 +1353,6 @@
-<property>
-  <name>any23.extractors</name>
-  <value>html-microdata</value>
-  <description>Comma-separated list of Any23 extractors (a list of
-  extractors is available here: http://any23.apache.org/getting-started.html)
-  </description>
-</property>
-
-<property>
-  <name>any23.content_types</name>
-  <value>text/html,application/xhtml+xml</value>
-  <description>Comma-separated list of content-types onto which Any23
-  extractors should be applied (see
-  http://www.iana.org/assignments/media-types/). If empty, all
-  content-types are supported.</description>
-</property>
diff --git a/default.properties b/default.properties
index df96199c1e..17e0bffbbc 100644
--- a/default.properties
+++ b/default.properties
@@ -210,6 +210,4 @@ plugins.misc=\
   org.apache.nutch.collection*:\
   org.apache.nutch.analysis.lang*:\
   org.creativecommons.nutch*:\
-  org.apache.nutch.microformats.reltag*:\
-  org.apache.nutch.any23*
-
+  org.apache.nutch.microformats.reltag*:

diff --git a/src/plugin/any23/build-ivy.xml b/src/plugin/any23/build-ivy.xml
deleted file mode 100644
index 6c7c6b906b..0000000000
--- a/src/plugin/any23/build-ivy.xml
+++ /dev/null
@@ -1,47 +0,0 @@
 [47 lines of Ant/Ivy build XML deleted; markup lost in extraction]

diff --git a/src/plugin/any23/build.xml b/src/plugin/any23/build.xml
deleted file mode 100644
index 790b18548d..0000000000
--- a/src/plugin/any23/build.xml
+++ /dev/null
@@ -1,36 +0,0 @@
 [36 lines of Ant build XML deleted; markup lost in extraction]

diff --git a/src/plugin/any23/howto_upgrade_any23.txt b/src/plugin/any23/howto_upgrade_any23.txt
deleted file mode 100644
index 32f9162f41..0000000000
--- a/src/plugin/any23/howto_upgrade_any23.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-1. Upgrade Any23 dependency in src/plugin/any23/ivy.xml
-
-2. Upgrade Any23's own dependencies in src/plugin/any23/plugin.xml
-   To get the list of dependencies and their versions execute:
-   $ cd src/plugin/any23/
-   $ ant -f ./build-ivy.xml
-   $ ls lib | sed 's/^/    <library name="/g' | sed 's/$/"\/>/g'
-
-   In the plugin.xml replace all lines between
-      <!-- Any23 dependencies -->
-   and
-      <!-- end of Any23 dependencies -->
-   with the output of the command above.
-
-3. Remove the locally "installed" dependencies in src/plugin/any23/lib/:
-
-   $ rm -rf lib/
-
-4. Build Nutch and run all unit tests:
-
-   $ cd ../../../
-   $ ant clean runtime test

diff --git a/src/plugin/any23/ivy.xml b/src/plugin/any23/ivy.xml
deleted file mode 100644
index 3b755ee3fa..0000000000
--- a/src/plugin/any23/ivy.xml
+++ /dev/null
@@ -1,49 +0,0 @@
 [49 lines of Ivy module XML deleted; markup lost in extraction]

diff --git a/src/plugin/any23/plugin.xml b/src/plugin/any23/plugin.xml
deleted file mode 100644
index dae8c47aa3..0000000000
--- a/src/plugin/any23/plugin.xml
+++ /dev/null
@@ -1,216 +0,0 @@
 [216 lines of plugin descriptor XML deleted; markup lost in extraction]

diff --git a/src/plugin/any23/sample/BBC_News_Scotland.html b/src/plugin/any23/sample/BBC_News_Scotland.html
deleted file mode 100644
index d7cb10a826..0000000000
--- a/src/plugin/any23/sample/BBC_News_Scotland.html
+++ /dev/null
@@ -1,3780 +0,0 @@
 [3,780 lines of deleted sample HTML: a saved BBC News Scotland page (dated 31 March 2014) used as an Any23 test fixture; the markup did not survive extraction]
\ No newline at end of file

diff --git a/src/plugin/any23/sample/microdata_basic.html b/src/plugin/any23/sample/microdata_basic.html
deleted file mode 100644
index 3ffca84251..0000000000
--- a/src/plugin/any23/sample/microdata_basic.html
+++ /dev/null
@@ -1,107 +0,0 @@
 [107 lines of deleted sample HTML: basic schema.org-style microdata examples (people, a band, a movie, a book, a cat named Hedral); markup lost in extraction]
\ No newline at end of file

diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
deleted file mode 100644
index 09dc32e02d..0000000000
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.any23;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>This implementation of {@link org.apache.nutch.indexer.IndexingFilter}
- * adds a triple(s) field to the {@link org.apache.nutch.indexer.NutchDocument}.</p>
- * <p>Triples are extracted via Apache Any23.</p>
- * @see org.apache.nutch.any23.Any23ParseFilter - */ -public class Any23IndexingFilter implements IndexingFilter { - - /** Logging instance */ - private static final Logger LOG = LoggerFactory.getLogger(Any23IndexingFilter.class); - - public static final String STRUCTURED_DATA = "structured_data"; - - private Configuration conf; - - /** - * Get the {@link Configuration} object - * @see org.apache.hadoop.conf.Configurable#getConf() - */ - @Override - public Configuration getConf() { - return this.conf; - } - - /** - * Set the {@link Configuration} object - * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration) - */ - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - /** - * - * @param doc - * document instance for collecting fields - * @param parse - * parse data instance - * @param url - * page url - * @param datum - * crawl datum for the page (fetch datum from segment containing - * fetch status and fetch time) - * @param inlinks - * page inlinks - * @return filtered NutchDocument - * @see org.apache.nutch.indexer.IndexingFilter#filter(NutchDocument, Parse, Text, CrawlDatum, Inlinks) - * - * @throws IndexingException if there is a fatl error whilst indexing - */ - @Override - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { - String[] metadata = parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES); - - if (metadata != null) { - for (String triple : metadata) { - Pattern pattern = Pattern.compile("^([^ ]+) ([^ ]+) (.+) \\."); - Matcher matcher = pattern.matcher(triple); - if (matcher.find()) { - Map map = new HashMap<>(); - map.put("node", matcher.group(1)); - map.put("key", matcher.group(2)); - map.put("short_key", keyToShortKey(matcher.group(2))); - map.put("value", matcher.group(3)); - doc.add("structured_data", map); - } else { - LOG.warn("Unsupported triple format " + triple); - } - } - } - return doc; - } - - private static String keyToShortKey(String key) { - if (key.startsWith("<") && key.endsWith(">")) { - key = key.substring(1, key.length() - 1); - } - String[] keyParts = key.split("/"); - String[] keySubParts = keyParts[keyParts.length - 1].split("#"); - return keySubParts[keySubParts.length - 1]; - } -} diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java deleted file mode 100644 index bed659f352..0000000000 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-package org.apache.nutch.any23;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.net.URISyntaxException;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.apache.any23.Any23;
-import org.apache.any23.extractor.ExtractionException;
-import org.apache.any23.filter.IgnoreAccidentalRDFa;
-import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
-import org.apache.any23.mime.TikaMIMETypeDetector;
-import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
-import org.apache.any23.writer.BenchmarkTripleHandler;
-import org.apache.any23.writer.NTriplesWriter;
-import org.apache.any23.writer.TripleHandler;
-import org.apache.any23.writer.TripleHandlerException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.DocumentFragment;
-
-/**
- * <p>This implementation of {@link org.apache.nutch.parse.HtmlParseFilter}
- * uses the Apache Any23 library
- * for parsing and extracting structured data in RDF format from a
- * variety of Web documents. The supported formats can be found at Apache Any23.</p>
- * <p>In this implementation triples are written as Notation3
- * and triples are identified within output triple streams by the presence of '\n'.
- * The presence of the '\n' is a characteristic specific to N3 serialization in Any23.
- * In order to use another/other writers implementing the
- * TripleHandler
- * interface, we will most likely need to identify an alternative data characteristic
- * which we can use to split triples streams.</p>
- */ -public class Any23ParseFilter implements HtmlParseFilter { - - /** Logging instance */ - private static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class); - - private Configuration conf = null; - - /** - * Constant identifier used as a Key for writing and reading - * triples to and from the metadata Map field. - */ - public static final String ANY23_TRIPLES = "Any23-Triples"; - - public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors"; - public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types"; - - private static class Any23Parser { - - Set triples = null; - - Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException { - this.triples = new TreeSet<>(); - try { - parse(url, htmlContent, contentType, extractorNames); - } catch (URISyntaxException e) { - LOG.error("Error parsing URI: {}", url, e); - throw new RuntimeException(e.getReason()); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Maintains a {@link java.util.Set} containing the triples - * @return a {@link java.util.Set} of triples. - */ - Set getTriples() { - return this.triples; - } - - private void parse(String url, String htmlContent, String contentType, String... extractorNames) - throws URISyntaxException, IOException, TripleHandlerException { - Any23 any23 = new Any23(extractorNames); - any23.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier())); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try (TripleHandler tHandler = new IgnoreTitlesOfEmptyDocuments( - new IgnoreAccidentalRDFa( - new NTriplesWriter(baos))); - BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler)) { - try { - any23.extract(htmlContent, url, contentType, "UTF-8", bHandler); - } catch (IOException e) { - LOG.error("Error while reading the source", e); - } catch (ExtractionException e) { - LOG.error("Error while extracting structured data", e); - } - - LOG.debug("Any23 BenchmarkTripleHandler.report: " + bHandler.report()); - - String n3 = baos.toString("UTF-8"); - String[] triplesStrings = n3.split("\n"); - Collections.addAll(this.triples, triplesStrings); - } catch (IOException e) { - LOG.error("Unexpected IOException", e); - } - } - } - - @Override - public Configuration getConf() { - return this.conf; - } - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - /** - * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment) - */ - @Override - public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { - String[] extractorNames = this.conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta"); - String[] supportedContentTypes = this.conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml"); - String contentType = content.getContentType(); - if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) { - LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType); - return parseResult; - } - - Any23Parser parser; - try { - String htmlContent = new String(content.getContent(), Charset.forName("UTF-8")); - parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames); - } catch (TripleHandlerException e) { - throw new RuntimeException("Error running Any23 parser: " + e.getMessage()); - } - Set triples = 
parser.getTriples(); - - Parse parse = parseResult.get(content.getUrl()); - Metadata metadata = parse.getData().getParseMeta(); - - for (String triple : triples) { - metadata.add(ANY23_TRIPLES, triple); - } - - return parseResult; - } -} diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java b/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java deleted file mode 100644 index 47010768c6..0000000000 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * This packages uses the Apache Any23 library - * for parsing and extracting structured data in RDF format from a - * variety of Web documents. The supported formats can be found - * at Apache Any23. - */ -package org.apache.nutch.any23; diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java deleted file mode 100644 index 1367e19c46..0000000000 --- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.any23; - -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestAny23IndexingFilter { - @Test - public void testAny23TriplesFields() throws Exception { - Configuration conf = NutchConfiguration.create(); - Any23IndexingFilter filter = new Any23IndexingFilter(); - filter.setConf(conf); - Assert.assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", - new Outlink[] { }, new Metadata()); - ParseImpl parse = new ParseImpl("test page", parseData); - String[] triples = new String[]{ - " .", - " \"77\" .", - " \"Zurique\"@pt ." - }; - for (String triple : triples) { - parse.getData().getParseMeta().add(Any23ParseFilter.ANY23_TRIPLES, triple); - } - try { - doc = filter.filter(doc, parse, new Text("http://nutch.apache.org/"), new CrawlDatum(), new Inlinks()); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.getMessage()); - } - List docTriples = doc.getField(Any23IndexingFilter.STRUCTURED_DATA).getValues(); - Assert.assertEquals(docTriples.size(), triples.length); - - Object triple = docTriples.get(0); - Assert.assertTrue(triple instanceof Map); - @SuppressWarnings("unchecked") - Map structuredData = (Map) triple; - Assert.assertEquals(structuredData.get("node"), ""); - Assert.assertEquals(structuredData.get("key"), ""); - Assert.assertEquals(structuredData.get("short_key"), "sameAs"); - Assert.assertEquals(structuredData.get("value"), ""); - - triple = docTriples.get(1); - Assert.assertTrue(triple instanceof Map); - structuredData = (Map) triple; - Assert.assertEquals(structuredData.get("node"), ""); - Assert.assertEquals(structuredData.get("key"), ""); - Assert.assertEquals(structuredData.get("short_key"), "yearHumidity"); - Assert.assertEquals(structuredData.get("value"), "\"77\""); - } -} diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java deleted file mode 100644 index 09c253fbc5..0000000000 --- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.any23; - -import java.io.File; -import java.io.IOException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParserNotFound; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -public class TestAny23ParseFilter { - - - private Configuration conf; - - private String fileSeparator = System.getProperty("file.separator"); - - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/any23/build.xml during plugin compilation. - private String file1 = "BBC_News_Scotland.html"; - - private String file2 = "microdata_basic.html"; - - private static final int EXPECTED_TRIPLES_1 = 79; - - private static final int EXPECTED_TRIPLES_2 = 40; - - @Before - public void setUp() { - this.conf = NutchConfiguration.create(); - conf.set("file.content.limit", "-1"); - conf.set("parser.timeout", "-1"); - conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links," - + "html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard," - + "html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate," - + "html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath"); - conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html"); - } - - @Test - public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException { - String[] triplesArray = getTriples(file1); - - Assert.assertEquals("We expect 117 tab-separated triples extracted by the filter", - EXPECTED_TRIPLES_1, triplesArray.length); - } - - @Test - public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException { - String[] triplesArray = getTriples(file2); - - Assert.assertEquals("We expect 40 tab-separated triples extracted by the filter", - EXPECTED_TRIPLES_2, triplesArray.length); - } - - @Test - public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException { - String[] triplesArray = getTriples(file1, "application/pdf"); - - Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored", - 0, triplesArray.length); - } - - public String[] extract(String urlString, File file, String contentType) { - try { - System.out.println(urlString); - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - content.setContentType(contentType); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.toString()); - } - return null; - } - - private String[] getTriples(String fileName) { - return getTriples(fileName, "text/html"); - } - - private String[] getTriples(String fileName, String contentType) { - String urlString = 
"file:" + sampleDir + fileSeparator + fileName; - - File file = new File(sampleDir + fileSeparator + fileName); - - return extract(urlString, file, contentType); - } -} diff --git a/src/plugin/build.xml b/src/plugin/build.xml index e83f252734..34688ed566 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -35,7 +35,6 @@ - @@ -114,7 +113,6 @@ - @@ -176,7 +174,6 @@ - diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt index cb3ed6be87..2dcf76c455 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.txt @@ -33,8 +33,6 @@ $ cd ../language-identifier/ -It should be noted that Any23 also has a dependency on Tika so you may wish to check that there are no classpath conflicts in the any23 plugin as well. - 7. Build Nutch and run all unit tests: $ cd ../../../ From 10f7c0c5823ae4b7867c89339acec64fec277058 Mon Sep 17 00:00:00 2001 From: tallison Date: Thu, 14 Sep 2023 13:50:58 -0400 Subject: [PATCH 09/28] NUTCH-2959 -- bump Tika to 2.9.0 --- ivy/ivy.xml | 2 +- src/plugin/language-identifier/ivy.xml | 2 +- src/plugin/language-identifier/plugin.xml | 8 +- src/plugin/parse-tika/ivy.xml | 3 +- src/plugin/parse-tika/plugin.xml | 123 +++++++++++----------- 5 files changed, 71 insertions(+), 67 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 18a6df2302..484da135b8 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -56,7 +56,7 @@ - + diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml index 395047c6fc..5c357a75b6 100644 --- a/src/plugin/language-identifier/ivy.xml +++ b/src/plugin/language-identifier/ivy.xml @@ -36,7 +36,7 @@ - + diff --git a/src/plugin/language-identifier/plugin.xml b/src/plugin/language-identifier/plugin.xml index 357c4a67cd..28cfd70317 100644 --- a/src/plugin/language-identifier/plugin.xml +++ b/src/plugin/language-identifier/plugin.xml @@ -27,15 +27,15 @@ - - + + - + - + diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index f0ec7a8d8c..d4718ea498 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -36,7 +36,7 @@ - + @@ -53,6 +53,7 @@ + diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index d88405bc1c..3bc0a822cb 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -26,84 +26,87 @@ - - - - - - - + + + + + + + - - - + + + - - + + - + - - + + + + + - - - + + + - - + + - - - - + + + + + - - - - - - - - - + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - From 51055ef47ac09e082ae74bfa2720a84af431da19 Mon Sep 17 00:00:00 2001 From: tballison Date: Thu, 14 Sep 2023 14:42:01 -0400 Subject: [PATCH 10/28] NUTCH-2978 -- update slf4j-api --- ivy/ivy.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 9e19cec33d..d2f86ded70 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -39,7 +39,7 @@ - + From 6bfeaf4e2b9042ac3d9787fbbf558b16310c099a Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 18 Sep 2023 15:07:40 -0400 Subject: [PATCH 11/28] NUTCH-2959 -- bump Tika to 2.9.0, bump common dependencies throughout --- ivy/ivy.xml | 14 +++++++------- src/plugin/indexer-cloudsearch/plugin.xml | 6 +++--- src/plugin/indexer-kafka/plugin.xml | 6 +++--- src/plugin/indexer-solr/plugin.xml | 2 +- src/plugin/lib-htmlunit/plugin.xml | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/ivy/ivy.xml 
b/ivy/ivy.xml index 9aceed2c13..ce6ee002bb 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -41,11 +41,11 @@ - + - - + + @@ -88,10 +88,10 @@ - - - - + + + + diff --git a/src/plugin/indexer-cloudsearch/plugin.xml b/src/plugin/indexer-cloudsearch/plugin.xml index 5b4425359a..f18bc49eab 100644 --- a/src/plugin/indexer-cloudsearch/plugin.xml +++ b/src/plugin/indexer-cloudsearch/plugin.xml @@ -29,9 +29,9 @@ - - - + + + diff --git a/src/plugin/indexer-kafka/plugin.xml b/src/plugin/indexer-kafka/plugin.xml index c5cc21c01c..e49b6d4c30 100644 --- a/src/plugin/indexer-kafka/plugin.xml +++ b/src/plugin/indexer-kafka/plugin.xml @@ -25,9 +25,9 @@ - - - + + + diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml index d49641cf9c..f672ac9ed0 100644 --- a/src/plugin/indexer-solr/plugin.xml +++ b/src/plugin/indexer-solr/plugin.xml @@ -17,7 +17,7 @@ - + diff --git a/src/plugin/lib-htmlunit/plugin.xml b/src/plugin/lib-htmlunit/plugin.xml index 95caaa3201..6f14209af4 100644 --- a/src/plugin/lib-htmlunit/plugin.xml +++ b/src/plugin/lib-htmlunit/plugin.xml @@ -50,16 +50,16 @@ - + - + - + From f11d383b62ea4dd7ff988a9962f1d5ccb3c82d10 Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 19 Sep 2023 09:38:17 -0400 Subject: [PATCH 12/28] NUTCH-2959 -- bump commons-io --- ivy/ivy.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index ce6ee002bb..2ef3599ab3 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -45,6 +45,7 @@ + From 0f801c15874b16217cd78745d4773f2b741a2dce Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 19 Sep 2023 11:24:58 -0400 Subject: [PATCH 13/28] NUTCH-2959 -- downgrade commons-io to match the version we expect to come out with Hadoop 3.4.0. --- ivy/ivy.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 2ef3599ab3..b391649ea7 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -45,7 +45,9 @@ - + + From 5be64d2dad755f55980a1ea767abfb8e9fcc808a Mon Sep 17 00:00:00 2001 From: tballison Date: Mon, 25 Sep 2023 09:09:20 -0400 Subject: [PATCH 14/28] NUTCH-3004 -- propagate ssl exception if message doesn't match "handshake alert..." --- .../src/java/org/apache/nutch/protocol/http/HttpResponse.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 260a7c19c2..48918dc514 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -150,6 +150,10 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum) + e.getMessage(); throw new HttpException(msg); } + } else { + String msg = "SSL connect to " + url + " failed with: " + + e.getMessage(); + throw new HttpException(msg, e); } } socket = sslsocket; From 417b8773231136eb48957f743c2bc3c21f624d4e Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 28 Sep 2023 12:05:50 +0200 Subject: [PATCH 15/28] NUTCH-2852 SpotBugs: Method invokes System.exit(...) - remove all calls of System.exit(...) 
in methods except main(args) of various "checker" tools --- .../org/apache/nutch/indexer/IndexingFiltersChecker.java | 4 ++-- src/java/org/apache/nutch/net/URLFilterChecker.java | 4 ++-- src/java/org/apache/nutch/net/URLNormalizerChecker.java | 4 ++-- src/java/org/apache/nutch/parse/ParserChecker.java | 4 ++-- src/java/org/apache/nutch/util/AbstractChecker.java | 9 ++++----- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java index 3aa7a05cba..1931c360d8 100644 --- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java +++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java @@ -93,7 +93,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } // read property "doIndex" for back-ward compatibility @@ -126,7 +126,7 @@ public int run(String[] args) throws Exception { } else if (i != args.length - 1) { System.err.println("ERR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } else { url = args[i]; } diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java index 7916cc5794..821f2e9267 100644 --- a/src/java/org/apache/nutch/net/URLFilterChecker.java +++ b/src/java/org/apache/nutch/net/URLFilterChecker.java @@ -41,7 +41,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } int numConsumed; @@ -53,7 +53,7 @@ public int run(String[] args) throws Exception { } else { System.err.println("ERROR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } } diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java index 586c7b2460..46fdd38cfb 100644 --- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java +++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java @@ -44,7 +44,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } int numConsumed; @@ -58,7 +58,7 @@ public int run(String[] args) throws Exception { } else { System.err.println("ERROR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } } diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 1533ab57cc..10eec4b244 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -104,7 +104,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } // initialize plugins early to register URL stream handlers to support @@ -138,7 +138,7 @@ public int run(String[] args) throws Exception { } else if (i != args.length - 1) { System.err.println("ERR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } else { url = args[i]; } diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java index 3116ede146..1374812250 100644 --- 
a/src/java/org/apache/nutch/util/AbstractChecker.java +++ b/src/java/org/apache/nutch/util/AbstractChecker.java @@ -72,8 +72,7 @@ protected int parseArgs(String[] args, int i) { protected int run() throws Exception { // In listening mode? if (tcpPort != -1) { - processTCP(tcpPort); - return 0; + return processTCP(tcpPort); } else if (stdin) { return processStdin(); } @@ -104,7 +103,7 @@ protected int processStdin() throws Exception { // Open TCP socket and process input @SuppressWarnings("resource") - protected void processTCP(int tcpPort) throws Exception { + protected int processTCP(int tcpPort) throws Exception { ServerSocket server = null; try { @@ -113,7 +112,7 @@ protected void processTCP(int tcpPort) throws Exception { LOG.info(server.toString()); } catch (Exception e) { LOG.error("Could not listen on port " + tcpPort, e); - System.exit(-1); + return -1; } while(true){ @@ -124,7 +123,7 @@ protected void processTCP(int tcpPort) throws Exception { thread.start(); } catch (Exception e) { LOG.error("Accept failed: " + tcpPort, e); - System.exit(-1); + return -1; } } } From a72a53a32d2183f8a8baefbd50afd007279e4857 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 28 Sep 2023 12:26:29 +0200 Subject: [PATCH 16/28] NUTCH-3007 Fix impossible casts - remove code blocks (else clauses) unneeded and containing impossible casts --- src/java/org/apache/nutch/fetcher/Fetcher.java | 13 ++----------- src/java/org/apache/nutch/parse/ParseSegment.java | 13 ++----------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 7cc87f40c6..3727dcebef 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -598,20 +598,11 @@ public Map run(Map args, String crawlId) throws Path segment = null; if(args.containsKey(Nutch.ARG_SEGMENTS)) { Object seg = args.get(Nutch.ARG_SEGMENTS); - if(seg instanceof Path) { + if (seg instanceof Path) { segment = (Path) seg; - } - else if(seg instanceof String){ + } else if (seg instanceof String) { segment = new Path(seg.toString()); } - else if(seg instanceof ArrayList) { - String[] segmentsArray = (String[])seg; - segment = new Path(segmentsArray[0].toString()); - - if(segmentsArray.length > 1){ - LOG.warn("Only the first segment of segments array is used."); - } - } } else { String segmentDir = crawlId+"/segments"; diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index 7e4707d399..c4e271feec 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -312,20 +312,11 @@ public Map run(Map args, String crawlId) throws Path segment = null; if(args.containsKey(Nutch.ARG_SEGMENTS)) { Object seg = args.get(Nutch.ARG_SEGMENTS); - if(seg instanceof Path) { + if (seg instanceof Path) { segment = (Path) seg; - } - else if(seg instanceof String){ + } else if (seg instanceof String) { segment = new Path(seg.toString()); } - else if(seg instanceof ArrayList) { - String[] segmentsArray = (String[])seg; - segment = new Path(segmentsArray[0].toString()); - - if(segmentsArray.length > 1){ - LOG.warn("Only the first segment of segments array is used."); - } - } } else { String segment_dir = crawlId+"/segments"; From 810b1d6ad50fa9021469b4ca5e1db9050a3263c5 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 30 Sep 2023 08:09:18 +0200 Subject: [PATCH 17/28] NUTCH-3010 Injector: count 
unique number of injected URLs - add counter urls_injected_unique - improve log messages reporting the counts of injected/merged URLs --- src/java/org/apache/nutch/crawl/Injector.java | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index b93e8ca76a..9fca719f62 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -341,8 +341,11 @@ public void reduce(Text key, Iterable values, Context context) ? injected.getFetchInterval() : old.getFetchInterval()); } } - if (injectedSet && oldSet) { - context.getCounter("injector", "urls_merged").increment(1); + if (injectedSet) { + context.getCounter("injector", "urls_injected_unique").increment(1); + if (oldSet) { + context.getCounter("injector", "urls_merged").increment(1); + } } context.write(key, result); } @@ -448,22 +451,24 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, if (LOG.isInfoEnabled()) { long urlsInjected = job.getCounters() .findCounter("injector", "urls_injected").getValue(); + long urlsInjectedUniq = job.getCounters() + .findCounter("injector", "urls_injected_unique").getValue(); long urlsFiltered = job.getCounters() .findCounter("injector", "urls_filtered").getValue(); long urlsMerged = job.getCounters() .findCounter("injector", "urls_merged").getValue(); - long urlsPurged404= job.getCounters() + long urlsPurged404 = job.getCounters() .findCounter("injector", "urls_purged_404").getValue(); - long urlsPurgedFilter= job.getCounters() + long urlsPurgedFilter = job.getCounters() .findCounter("injector", "urls_purged_filter").getValue(); - LOG.info("Injector: Total urls rejected by filters: " + urlsFiltered); + LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered); LOG.info( - "Injector: Total urls injected after normalization and filtering: " - + urlsInjected); - LOG.info("Injector: Total urls injected but already in CrawlDb: " - + urlsMerged); - LOG.info("Injector: Total new urls injected: " - + (urlsInjected - urlsMerged)); + "Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})", + urlsInjected, urlsInjectedUniq); + LOG.info("Injector: Total urls injected but already in CrawlDb: {}", + urlsMerged); + LOG.info("Injector: Total new urls injected: {}", + (urlsInjectedUniq - urlsMerged)); if (filterNormalizeAll) { LOG.info("Injector: Total urls removed from CrawlDb by filters: {}", urlsPurgedFilter); @@ -475,8 +480,8 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, } long end = System.currentTimeMillis(); - LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end), + TimingUtil.elapsedTime(start, end)); } } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) { LOG.error("Injector job failed: {}", e.getMessage()); From a1ab4333e0a1a28ac2e0f9c75871f7feeb5f2f81 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 30 Sep 2023 11:12:07 +0200 Subject: [PATCH 18/28] NUTCH-2897 Do not suppress deprecated API warnings - deprecate constructor of NutchJob - remove deprecated call to Object.finalize() from Plugin.finalize() --- src/java/org/apache/nutch/plugin/Plugin.java | 2 -- src/java/org/apache/nutch/util/NutchJob.java | 13 ++++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git
a/src/java/org/apache/nutch/plugin/Plugin.java b/src/java/org/apache/nutch/plugin/Plugin.java index b2e717d20e..3a0fb2e915 100644 --- a/src/java/org/apache/nutch/plugin/Plugin.java +++ b/src/java/org/apache/nutch/plugin/Plugin.java @@ -90,9 +90,7 @@ private void setDescriptor(PluginDescriptor descriptor) { } @Override - @SuppressWarnings("deprecation") protected void finalize() throws Throwable { - super.finalize(); shutDown(); } } diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index 478b24f89e..068c64fefb 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -35,7 +35,18 @@ public class NutchJob extends Job { private static final String JOB_FAILURE_LOG_FORMAT = "%s job did not succeed, job id: %s, job status: %s, reason: %s"; - @SuppressWarnings("deprecation") + /** + * @deprecated, use instead {@link #getInstance(Configuration)} or + * {@link Job#getInstance(Configuration, String)}. + * + * @param conf + * configuration for the job + * @param jobName + * name of the job + * @throws IOException + * see {@link Job#Job(Configuration, String)} + */ + @Deprecated public NutchJob(Configuration conf, String jobName) throws IOException { super(conf, jobName); if (conf != null) { From a74b57b90409b9488caa169e7bc3c6d1ff8067f4 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 30 Sep 2023 11:16:59 +0200 Subject: [PATCH 19/28] NUTCH-2853 bin/nutch: remove deprecated commands solrindex, solrdedup, solrclean --- src/bin/nutch | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/bin/nutch b/src/bin/nutch index 5b999fa6f5..561c79e778 100755 --- a/src/bin/nutch +++ b/src/bin/nutch @@ -81,9 +81,6 @@ if [ $# = 0 ]; then echo " dedup deduplicate entries in the crawldb and give them a special status" echo " dump exports crawled data from segments into files" echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR" - echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead" - echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead" - echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead" echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins" echo " parsechecker check the parser for a given url" echo " indexchecker check the indexing filters for a given url" @@ -253,19 +250,14 @@ elif [ "$COMMAND" = "dump" ] ; then CLASS=org.apache.nutch.tools.FileDumper elif [ "$COMMAND" = "commoncrawldump" ] ; then CLASS=org.apache.nutch.tools.CommonCrawlDataDumper -elif [ "$COMMAND" = "solrindex" ] ; then - CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1" - shift +elif [ "$COMMAND" = "solrindex" ] || [ "$COMMAND" = "solrdedup" ] || [ "$COMMAND" = "solrclean" ]; then + REPLACEMENT="${COMMAND#solr}" + echo "The command $COMMAND was replaced by the command $REPLACEMENT" + exit -1 elif [ "$COMMAND" = "index" ] ; then CLASS=org.apache.nutch.indexer.IndexingJob -elif [ "$COMMAND" = "solrdedup" ] ; then - echo "Command $COMMAND is deprecated, please use dedup instead" - exit -1 elif [ "$COMMAND" = "dedup" ] ; then CLASS=org.apache.nutch.crawl.DeduplicationJob -elif [ "$COMMAND" = "solrclean" ] ; then - CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1" - shift; shift elif [ "$COMMAND" = "clean" ] ; then 
CLASS=org.apache.nutch.indexer.CleaningJob elif [ "$COMMAND" = "parsechecker" ] ; then From 9faf364a7fa1631f553a36b8234c1169eba0f5c3 Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 3 Oct 2023 10:48:09 -0400 Subject: [PATCH 20/28] Working now locally and with Seb's single_node_cluster tests --- ivy/ivy.xml | 2 +- ivy/ivysettings.xml | 7 ++ src/plugin/language-identifier/ivy.xml | 8 +-- src/plugin/language-identifier/plugin.xml | 11 +-- src/plugin/parse-tika/ivy.xml | 19 +---- src/plugin/parse-tika/plugin.xml | 84 +---------------------- 6 files changed, 12 insertions(+), 119 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index b391649ea7..6f39262449 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -70,7 +70,7 @@ - + diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml index 18038a5ca4..a060df5b69 100644 --- a/ivy/ivysettings.xml +++ b/ivy/ivysettings.xml @@ -32,7 +32,14 @@ + + + + + - - - - - - - + diff --git a/src/plugin/language-identifier/plugin.xml b/src/plugin/language-identifier/plugin.xml index 28cfd70317..dab1a52f31 100644 --- a/src/plugin/language-identifier/plugin.xml +++ b/src/plugin/language-identifier/plugin.xml @@ -26,16 +26,7 @@ - - - - - - - - - - + diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index d4718ea498..1586d9661f 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -36,24 +36,7 @@ - - - - - - - - - - - - - - - - - + diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index 3bc0a822cb..dd4fe7fde8 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -25,89 +25,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + From 9aabc459a4525f1f50f4597cf39599441403cc69 Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 3 Oct 2023 11:02:54 -0400 Subject: [PATCH 21/28] update howto_upgrade_tika.txt --- src/plugin/parse-tika/howto_upgrade_tika.txt | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt index 2dcf76c455..46d075948b 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.txt @@ -1,3 +1,25 @@ +We are currently using a shim (https://github.com/tballison/hadoop-safe-tika +because of binary conflicts in commons-io versions between what Hadoop supports and the more +modern features that Apache Tika and Apache POI were using in commons-io. + +For now, all you have to do is update the fat jar dependencies: + +1. tika-core-shaded in ivy/ivy.xml + +2. tika-parsers-standard-package-shaded in src/plugin/parse-tika/ivy.xml + +3. The library name version for tika-parsers-standard-package-shaded in src/plugin/parse-tika/plugin.xml + +4. Repeat steps 2 and 3 for the language-identifier + +5. Build Nutch and run all unit tests: + + $ cd ../../../ + $ ant clean runtime test + +The following directions are what we used to do with thin jars. Hopefully, we'll +be able to get back to these directions once we have version harmony with Hadoop and Tika/POI. + 1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml 2. 
Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml From e96cfc56ee04c8e7e07e11d4eef521b4674a9ec6 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 19 Sep 2023 08:10:14 +0200 Subject: [PATCH 22/28] NUTCH-3002 Protocol-okhttp HttpResponse: HTTP header metadata lookup should be case-insensitive - implement class CaseInsensitiveMetadata providing case-insensitive metadata look-ups (but no spell-checking) - use CaseInsensitiveMetadata to hold HTTP header metadata in in the class OkHttpResponse of protocol-okhttp - add unit tests to prove the fix (and also case-insensitive look-ups and spell-checking in protocol-http) --- .../metadata/CaseInsensitiveMetadata.java | 33 ++++ .../org/apache/nutch/metadata/Metadata.java | 4 +- .../nutch/metadata/SpellCheckedMetadata.java | 8 +- .../apache/nutch/net/protocols/Response.java | 2 +- .../nutch/protocol/http/TestResponse.java | 152 +++++++++++++++++ .../nutch/protocol/okhttp/OkHttpResponse.java | 3 +- .../nutch/protocol/okhttp/TestResponse.java | 154 ++++++++++++++++++ 7 files changed, 348 insertions(+), 8 deletions(-) create mode 100644 src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java create mode 100644 src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java create mode 100644 src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java diff --git a/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java new file mode 100644 index 0000000000..92e848ca2d --- /dev/null +++ b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import java.util.TreeMap; + +/** + * A decorator to Metadata that adds for case-insensitive lookup of keys. + */ +public class CaseInsensitiveMetadata extends Metadata { + + /** + * Constructs a new, empty metadata. + */ + public CaseInsensitiveMetadata() { + metadata = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + } + +} diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java index 5c37911fb9..7fa0bb12ce 100644 --- a/src/java/org/apache/nutch/metadata/Metadata.java +++ b/src/java/org/apache/nutch/metadata/Metadata.java @@ -36,7 +36,7 @@ public class Metadata implements Writable, CreativeCommons, DublinCore, /** * A map of all metadata attributes. */ - private Map metadata = null; + protected Map metadata = null; /** * Constructs a new, empty metadata. @@ -66,7 +66,7 @@ public String[] names() { } /** - * Get the value associated to a metadata name. If many values are assiociated + * Get the value associated to a metadata name. 
If many values are associated * to the specified name, then the first one is returned. * * @param name diff --git a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java index fdbf1b62c8..be161440e2 100644 --- a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java +++ b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java @@ -25,7 +25,7 @@ /** * A decorator to Metadata that adds spellchecking capabilities to property - * names. Currently used spelling vocabulary contains just the httpheaders from + * names. Currently used spelling vocabulary contains just the HTTP headers from * {@link HttpHeaders} class. * */ @@ -94,7 +94,7 @@ private static String normalize(final String str) { /** * Get the normalized name of metadata attribute name. This method tries to * find a well-known metadata name (one of the metadata names defined in this - * class) that matches the specified name. The matching is error tolerent. For + * class) that matches the specified name. The matching is error tolerant. For * instance, *
 *
 * - content-type gives Content-Type
  • @@ -105,8 +105,8 @@ private static String normalize(final String str) { * name is returned. * * @param name - * Name to normalize - * @return normalized name + * HTTP header name to normalize + * @return normalized HTTP header name */ public static String getNormalizedName(final String name) { String searched = normalize(name); diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index 0159358ec0..514ce85613 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -86,7 +86,7 @@ public static enum TruncatedContentReason { /** * Get the value of a named header. - * @param name key of the header you wish to retreive + * @param name key of the header you wish to retrieve * @return header value */ public String getHeader(String name); diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java new file mode 100644 index 0000000000..9d65b6df88 --- /dev/null +++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.http; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest; +import org.apache.nutch.protocol.ProtocolException; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; + +public class TestResponse extends AbstractHttpProtocolPluginTest { + + protected static final String redirectHeader = "HTTP/1.1 301 Moved Permanently\r\n" // + + "Content-Type: text/html; charset=UTF-8\r\n" // + + "Content-Length: 0\r\n"; + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + @Override + protected String getPluginClassName() { + return "org.apache.nutch.protocol.okhttp.OkHttp"; + } + + @Override + @Before + public void setUp() throws Exception { + conf = new Configuration(); + conf.addResource("nutch-default.xml"); + /* + * plugin tests specific config file - needs to add the tested plugin to + * plugin.includes + */ + conf.addResource("nutch-site-test.xml"); + conf.setBoolean("store.http.headers", true); + + http = new Http(); + http.setConf(conf); + } + + protected HttpResponse getResponse(int statusCode, String headerName) { + try { + URL url = new URL(protocol, localHost, defaultPort, "/" + headerName); + LOG.info("Emulating fetch of {}", url); + return new HttpResponse((Http) http, url, new CrawlDatum(statusCode, 1000)); + } catch (ProtocolException | IOException e) { + return null; + } + } + + protected void headerTest(int statusCode, String headerName, String value, String lookupName) { + HttpResponse response = getResponse(statusCode, headerName); + LOG.info("Response headers:"); + LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS)); + assertEquals( + "No or unexpected value of header \"" + headerName + + "\" returned when retrieving header \"" + lookupName + "\"", + value, response.getHeader(lookupName)); + } + + protected Map getResponses(String headerValue) { + String[] headerNames = { "Location", "location", "LOCATION", "Loction" }; + Map responses = new TreeMap<>(); + for (String headerName : headerNames) { + responses.put("/" + headerName, + (redirectHeader + headerName + ": " + headerValue + "\r\n" + + "Content-Length: 0\r\n\r\n").getBytes(UTF_8)); + } + responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": " + + headerValue + "\r\n" + simpleContent).getBytes(UTF_8)); + return responses; + } + + @Test + public void testGetHeader() throws Exception { + String value = "headervalue"; + launchServer(getResponses(value)); + + LOG.info( + "Testing standard HTTP header \"Location\": expected case-insensitive and error-tolerant matching"); + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + headerTest(301, "Loction", value, "Location"); + + LOG.info( + "Testing non-standard HTTP header \"MyCustomHeader\": only exact matching"); + headerTest(200, "MyCustomHeader", value, "MyCustomHeader"); + /* + * The following 
case-insensitive or approximate look-ups are not supported + * for non-standard headers by SpellCheckedMetadata: + */ + // testHeader(200, "MyCustomHeader", value, "mycustomheader"); + // testHeader(200, "mycustomheader", value, "MyCustomHeader"); + // testHeader(200, "MYCUSTOMHEADER", value, "MyCustomHeader"); + } + + @Ignore("Only for benchmarking") + @Test + public void testMetadataBenchmark() throws MalformedURLException, ProtocolException, + IOException, InterruptedException { + String value = "headervalue"; + launchServer(getResponses(value)); + Thread.sleep(30000); // time to attach a profiler + int iterations = 4000; + LOG.info("Starting benchmark with {} iterations ({} calls)", iterations, + (iterations * 5)); + long start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + headerTest(301, "Loction", value, "Location"); + } + long elapsed = System.currentTimeMillis() - start; + LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed, + (elapsed / (5.0 * iterations))); + } + +} diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 67bc45b035..605c03390f 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -24,6 +24,7 @@ import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.CaseInsensitiveMetadata; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; @@ -106,7 +107,7 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum) // ensure that Response and underlying ResponseBody are closed try (okhttp3.Response response = call.execute()) { - Metadata responsemetadata = new Metadata(); + Metadata responsemetadata = new CaseInsensitiveMetadata(); okhttp3.Headers httpHeaders = response.headers(); for (int i = 0, size = httpHeaders.size(); i < size; i++) { diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java new file mode 100644 index 0000000000..695a6c539c --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.okhttp; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest; +import org.apache.nutch.protocol.ProtocolException; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; + +public class TestResponse extends AbstractHttpProtocolPluginTest { + + protected static final String redirectHeader = "HTTP/1.1 301 Moved Permanently\r\n" // + + "Content-Type: text/html; charset=UTF-8\r\n" // + + "Content-Length: 0\r\n"; + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + @Override + protected String getPluginClassName() { + return "org.apache.nutch.protocol.okhttp.OkHttp"; + } + + @Override + @Before + public void setUp() throws Exception { + conf = new Configuration(); + conf.addResource("nutch-default.xml"); + /* + * plugin tests specific config file - needs to add the tested plugin to + * plugin.includes + */ + conf.addResource("nutch-site-test.xml"); + conf.setBoolean("store.http.headers", true); + + http = new OkHttp(); + http.setConf(conf); + } + + protected OkHttpResponse getResponse(int statusCode, String headerName) { + try { + URL url = new URL(protocol, localHost, defaultPort, "/" + headerName); + LOG.info("Emulating fetch of {}", url); + return new OkHttpResponse((OkHttp) http, url, new CrawlDatum(statusCode, 1000)); + } catch (ProtocolException | IOException e) { + return null; + } + } + + protected void headerTest(int statusCode, String headerName, String value, String lookupName) { + OkHttpResponse response = getResponse(statusCode, headerName); + LOG.info("Response headers:"); + LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS)); + assertEquals( + "No or unexpected value of header \"" + headerName + + "\" returned when retrieving header \"" + lookupName + "\"", + value, response.getHeader(lookupName)); + } + + protected Map getResponses(String headerValue) { + String[] headerNames = { "Location", "location", "LOCATION", "Loction" }; + Map responses = new TreeMap<>(); + for (String headerName : headerNames) { + responses.put("/" + headerName, + (redirectHeader + headerName + ": " + headerValue + "\r\n" + + "Content-Length: 0\r\n\r\n").getBytes(UTF_8)); + } + responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": " + + headerValue + "\r\n" + simpleContent).getBytes(UTF_8)); + return responses; + } + + @Test + public void testGetHeader() throws Exception { + String value = "headervalue"; + launchServer(getResponses(value)); + + LOG.info( + "Testing standard HTTP header \"Location\": expected case-insensitive and error-tolerant matching"); + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + // only with SpellCheckedMetadata: + // headerTest(301, "Loction", value, "Location"); + + LOG.info( + "Testing non-standard HTTP header \"MyCustomHeader\": only exact matching"); + headerTest(200, "MyCustomHeader", 
value, "MyCustomHeader"); + /* + * The following case-insensitive or approximate look-ups are not supported + * for non-standard headers by SpellCheckedMetadata: + */ + // testHeader(200, "MyCustomHeader", value, "mycustomheader"); + // testHeader(200, "mycustomheader", value, "MyCustomHeader"); + // testHeader(200, "MYCUSTOMHEADER", value, "MyCustomHeader"); + } + + @Ignore("Only for benchmarking") + @Test + public void testMetadataBenchmark() throws MalformedURLException, ProtocolException, + IOException, InterruptedException { + String value = "headervalue"; + launchServer(getResponses(value)); + Thread.sleep(30000); // time to attach a profiler + int iterations = 5000; + LOG.info("Starting benchmark with {} iterations ({} calls)", iterations, + (iterations * 4)); + long start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + // only with SpellCheckedMetadata: + // headerTest(301, "Loction", value, "Location"); + } + long elapsed = System.currentTimeMillis() - start; + LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed, + (elapsed / (4.0 * iterations))); + } + +} From bb68385f9601b37c61ef5a2baac58740c975bddb Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 28 Sep 2023 14:53:02 +0200 Subject: [PATCH 23/28] NUTCH-3009 Upgrade to Hadoop 3.3.6 --- default.properties | 2 +- ivy/ivy.xml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/default.properties b/default.properties index 17e0bffbbc..06f2ed0096 100644 --- a/default.properties +++ b/default.properties @@ -44,7 +44,7 @@ test.junit.output.format = plain javadoc.proxy.host=-J-DproxyHost= javadoc.proxy.port=-J-DproxyPort= javadoc.link.java=https://docs.oracle.com/en/java/javase/11/docs/api/ -javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.3.4/api/ +javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.3.6/api/ javadoc.packages=org.apache.nutch.* dist.dir=./dist diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 6f39262449..e5ae3882f5 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -53,19 +53,19 @@ - + - + - + - + From ecdd19dbdd4424bf9b9bce206f23992140ee43fe Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 21 Oct 2023 15:53:25 +0200 Subject: [PATCH 24/28] NUTCH-2990 HttpRobotRulesParser to follow 5 redirects as specified by RFC 9309 (#779) - follow multiple redirects when fetching robots.txt - number of followed redirects is configurable by the property http.robots.redirect.max (default: 5) Improvements to RobotRulesParser's robots.txt test utility - bug fix: the passed agent names need to be transferred to the property http.robots.agents earlier, before the protocol plugins are configured - more verbose debug logging --- conf/nutch-default.xml | 10 ++ .../nutch/protocol/RobotRulesParser.java | 32 ++-- .../http/api/HttpRobotRulesParser.java | 141 ++++++++++++++---- 3 files changed, 143 insertions(+), 40 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 58455b338c..18ed56b037 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -163,6 +163,16 @@ + + http.robots.redirect.max + 5 + Maximum number of redirects followed when fetching + a robots.txt file. RFC 9309 specifies that "crawlers SHOULD + follow at least five consecutive redirects, even across authorities + (for example, hosts in the case of HTTP)." 
+ + + http.agent.description diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java index 562c2c694f..d73c075060 100644 --- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java +++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java @@ -98,6 +98,7 @@ public abstract class RobotRulesParser implements Tool { protected Configuration conf; protected Set agentNames; + protected int maxNumRedirects = 5; /** set of host names or IPs to be explicitly excluded from robots.txt checking */ protected Set allowList = new HashSet<>(); @@ -149,6 +150,10 @@ public void setConf(Configuration conf) { } } } + LOG.info("Checking robots.txt for the following agent names: {}", agentNames); + + maxNumRedirects = conf.getInt("http.robots.redirect.max", 5); + LOG.info("Following max. {} robots.txt redirects", maxNumRedirects); String[] confAllowList = conf.getStrings("http.robot.rules.allowlist"); if (confAllowList == null) { @@ -294,8 +299,11 @@ public int run(String[] args) { "", "\tlocal file or URL parsed as robots.txt file", "\tIf starts with a protocol specification", - "\t(`http', `https', `ftp' or `file'), robots.txt it is fetched", - "\tusing the specified protocol. Otherwise, a local file is assumed.", + "\t(`http', `https', `ftp' or `file'), the URL is parsed, URL path", + "\tand query are removed and the path \"/robots.txt\" is appended.", + "\tThe resulting URL (the canonical robots.txt location) is then", + "\tfetched using the specified protocol.", + "\tIf the URL does not include a protocol, a local file is assumed.", "", "\tlocal file with URLs (one per line), for every URL", "\tthe path part (including the query) is checked whether", @@ -323,6 +331,16 @@ public int run(String[] args) { return -1; } + if (args.length > 2) { + // set agent name from command-line in configuration + // Note: when fetching via protocol this must be done + // before the protocol is configured + String agents = args[2]; + conf.set("http.robots.agents", agents); + conf.set("http.agent.name", agents.split(",")[0]); + setConf(conf); + } + Protocol protocol = null; URL robotsTxtUrl = null; if (args[0].matches("^(?:https?|ftp|file)://?.*")) { @@ -334,6 +352,7 @@ public int run(String[] args) { ProtocolFactory factory = new ProtocolFactory(conf); try { protocol = factory.getProtocol(robotsTxtUrl); + LOG.debug("Using protocol {} to fetch robots.txt", protocol.getClass()); } catch (ProtocolNotFound e) { LOG.error("No protocol found for {}: {}", args[0], StringUtils.stringifyException(e)); @@ -357,14 +376,6 @@ public int run(String[] args) { File urlFile = new File(args[1]); - if (args.length > 2) { - // set agent name from command-line in configuration and update parser - String agents = args[2]; - conf.set("http.robots.agents", agents); - conf.set("http.agent.name", agents.split(",")[0]); - setConf(conf); - } - List robotsTxtContent = null; if (getConf().getBoolean("fetcher.store.robotstxt", false)) { robotsTxtContent = new LinkedList<>(); @@ -373,6 +384,7 @@ public int run(String[] args) { try { BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl, robotsTxtContent); + LOG.debug("Robots.txt rules:\n{}", rules); if (robotsTxtContent != null) { for (Content robotsTxt : robotsTxtContent) { diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index db09a0c880..8d7263e3ea 100644 --- 
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -17,12 +17,15 @@ package org.apache.nutch.protocol.http.api; import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; import java.net.URL; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - +import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.net.protocols.Response; @@ -87,6 +90,13 @@ protected static String getCacheKey(URL url) { * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the * rules are cached to avoid re-fetching and re-parsing it again. * + *

    Following + * RFC + * 9309, section 2.3.1.2. Redirects, up to five consecutive HTTP redirects + * are followed when fetching the robots.txt file. The max. number of + * redirects followed is configurable by the property + * http.robots.redirect.max.

    + * * @param http * The {@link Protocol} object * @param url @@ -114,11 +124,11 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, if (robotRules != null) { return robotRules; // cached rule } else if (LOG.isTraceEnabled()) { - LOG.trace("cache miss {}", url); + LOG.trace("Robots.txt cache miss {}", url); } boolean cacheRule = true; - URL redir = null; + Set redirectCacheKeys = new HashSet<>(); if (isAllowListed(url)) { // check in advance whether a host is allowlisted @@ -129,43 +139,97 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, url.getHost()); } else { + URL robotsUrl = null, robotsUrlRedir = null; try { - URL robotsUrl = new URL(url, "/robots.txt"); + robotsUrl = new URL(url, "/robots.txt"); + + /* + * Redirect counter - following redirects up to the configured maximum + * ("five consecutive redirects" as per RFC 9309). + */ + int numRedirects = 0; + /* + * The base URL to resolve relative redirect locations is set initially + * to the default URL path ("/robots.txt") and updated when redirects + * were followed. + */ + robotsUrlRedir = robotsUrl; + Response response = ((HttpBase) http).getResponse(robotsUrl, new CrawlDatum(), true); + int code = response.getCode(); if (robotsTxtContent != null) { addRobotsContent(robotsTxtContent, robotsUrl, response); } - // try one level of redirection ? - if (response.getCode() == 301 || response.getCode() == 302) { - String redirection = response.getHeader("Location"); - if (redirection == null) { - // some versions of MS IIS are known to mangle this header - redirection = response.getHeader("location"); + + while (isRedirect(code) && numRedirects < maxNumRedirects) { + numRedirects++; + + String redirectionLocation = response.getHeader("Location"); + if (StringUtils.isNotBlank(redirectionLocation)) { + LOG.debug("Following robots.txt redirect: {} -> {}", robotsUrlRedir, + redirectionLocation); + try { + robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation); + } catch (MalformedURLException e) { + LOG.info( + "Failed to resolve redirect location for robots.txt: {} -> {} ({})", + robotsUrlRedir, redirectionLocation, e.getMessage()); + break; + } + response = ((HttpBase) http).getResponse(robotsUrlRedir, + new CrawlDatum(), true); + code = response.getCode(); + if (robotsTxtContent != null) { + addRobotsContent(robotsTxtContent, robotsUrlRedir, response); + } + } else { + LOG.info( + "No HTTP redirect Location header for robots.txt: {} (status code: {})", + robotsUrlRedir, code); + break; } - if (redirection != null) { - if (!redirection.startsWith("http")) { - // RFC says it should be absolute, but apparently it isn't - redir = new URL(url, redirection); + + if ("/robots.txt".equals(robotsUrlRedir.getFile())) { + /* + * If a redirect points to a path /robots.txt on a different host + * (or a different authority scheme://host:port/, in general), we + * can lookup the cache for cached rules from the target host. + */ + String redirectCacheKey = getCacheKey(robotsUrlRedir); + robotRules = CACHE.get(redirectCacheKey); + LOG.debug( + "Found cached robots.txt rules for {} (redirected to {}) under target key {}", + url, robotsUrlRedir, redirectCacheKey); + if (robotRules != null) { + /* If found, cache and return the rules for the source host. */ + CACHE.put(cacheKey, robotRules); + return robotRules; } else { - redir = new URL(redirection); + /* + * Remember the target host/authority, we can cache the rules, + * too. 
+ */ + redirectCacheKeys.add(redirectCacheKey); } + } - response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), - true); - if (robotsTxtContent != null) { - addRobotsContent(robotsTxtContent, redir, response); - } + if (numRedirects == maxNumRedirects && isRedirect(code)) { + LOG.info( + "Reached maximum number of robots.txt redirects for {} (assuming no robots.txt, allow all)", + url); } } - if (response.getCode() == 200) // found rules: parse them + LOG.debug("Fetched robots.txt for {} with status code {}", url, code); + if (code == 200) // found rules: parse them robotRules = parseRules(url.toString(), response.getContent(), response.getHeader("Content-Type"), agentNames); - else if ((response.getCode() == 403) && (!allowForbidden)) + else if ((code == 403) && (!allowForbidden)) robotRules = FORBID_ALL_RULES; // use forbid all - else if (response.getCode() >= 500) { + + else if (code >= 500) { cacheRule = false; // try again later to fetch robots.txt if (deferVisits503) { // signal fetcher to suspend crawling for this host @@ -177,8 +241,15 @@ else if (response.getCode() >= 500) { robotRules = EMPTY_RULES; // use default rules } } catch (Throwable t) { - if (LOG.isInfoEnabled()) { - LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); + if (robotsUrl == null || robotsUrlRedir == null) { + LOG.info("Couldn't get robots.txt for {}", url, t); + } else if (robotsUrl.equals(robotsUrlRedir)) { + LOG.info("Couldn't get robots.txt for {} ({}): {}", url, robotsUrl, + t); + } else { + LOG.info( + "Couldn't get redirected robots.txt for {} (redirected to {}): {}", + url, robotsUrlRedir, t); } cacheRule = false; // try again later to fetch robots.txt robotRules = EMPTY_RULES; @@ -187,17 +258,27 @@ else if (response.getCode() >= 500) { if (cacheRule) { CACHE.put(cacheKey, robotRules); // cache rules for host - if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost()) - && "/robots.txt".equals(redir.getFile())) { - // cache also for the redirected host - // if the URL path is /robots.txt - CACHE.put(getCacheKey(redir), robotRules); + for (String redirectCacheKey : redirectCacheKeys) { + /* + * and also for redirect target hosts where URL path and query were + * found to be "/robots.txt" + */ + CACHE.put(redirectCacheKey, robotRules); } } return robotRules; } + /** + * @param code + * HTTP response status code + * @return whether the status code signals a redirect to a different location + */ + private boolean isRedirect(int code) { + return (code == 301 || code == 302 || code == 303 || code == 307 || code == 308); + } + /** * Append {@link Content} of robots.txt to {@literal robotsTxtContent} * From b081c75d87be61e42297c952298b72eb7ff2a6dc Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sun, 1 Oct 2023 14:08:39 +0200 Subject: [PATCH 25/28] NUTCH-3011 HttpRobotRulesParser: handle HTTP 429 Too Many Requests same as server errors (HTTP 5xx) --- conf/nutch-default.xml | 11 ++++++----- .../nutch/protocol/http/api/HttpRobotRulesParser.java | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 18ed56b037..d8bf76486c 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -141,8 +141,9 @@ http.robots.503.defer.visits true Temporarily suspend fetching from a host if the - robots.txt response is HTTP 503 or any other 5xx server error. See - also http.robots.503.defer.visits.delay and + robots.txt response is HTTP 503 or any other 5xx server error + and HTTP 429 Too Many Requests. 
See also + http.robots.503.defer.visits.delay and http.robots.503.defer.visits.retries
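Taken together, the three defer-visits properties behave roughly like the sketch below. The names robotsStatus, retryCount, suspendHostUntil() and dropHostQueue() are hypothetical stand-ins; the actual handling lives in the fetcher's host queues.

    boolean deferVisits = conf.getBoolean("http.robots.503.defer.visits", true);
    long deferDelay = conf.getLong("http.robots.503.defer.visits.delay", 300000L);
    int maxRetries = conf.getInt("http.robots.503.defer.visits.retries", 3);

    if (deferVisits && (robotsStatus >= 500 || robotsStatus == 429)) {
      if (++retryCount > maxRetries) {
        dropHostQueue();  // give up on this host for the segment/cycle
      } else {
        suspendHostUntil(System.currentTimeMillis() + deferDelay);
      }
    }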
    @@ -150,7 +151,7 @@ http.robots.503.defer.visits.delay 300000 Time in milliseconds to suspend crawling a host if the - robots.txt response is HTTP 5xx - see + robots.txt response is HTTP 5xx or 429 Too Many Requests - see http.robots.503.defer.visits. @@ -158,8 +159,8 @@ http.robots.503.defer.visits.retries 3 Number of retries crawling a host if the robots.txt - response is HTTP 5xx - see http.robots.503.defer.visits. After n - retries the host queue is dropped for this segment/cycle. + response is HTTP 5xx or 429 - see http.robots.503.defer.visits. + After n retries the host queue is dropped for this segment/cycle. diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index 8d7263e3ea..ec5e77e433 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -229,7 +229,8 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url, else if ((code == 403) && (!allowForbidden)) robotRules = FORBID_ALL_RULES; // use forbid all - else if (code >= 500) { + else if (code >= 500 || code == 429) { + // 5xx server errors or 429 Too Many Requests cacheRule = false; // try again later to fetch robots.txt if (deferVisits503) { // signal fetcher to suspend crawling for this host From d2c3e96d88818d8107f320c49e007329b020e090 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 9 Oct 2023 10:21:01 +0200 Subject: [PATCH 26/28] NUTCH-3012 SegmentReader when dumping with option -recode: NPE on unparsed documents - fall back to UTF-8 when stringifying the content of unparsed documents --- src/java/org/apache/nutch/segment/SegmentReader.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java index 14546af543..ee5c266fd0 100644 --- a/src/java/org/apache/nutch/segment/SegmentReader.java +++ b/src/java/org/apache/nutch/segment/SegmentReader.java @@ -163,13 +163,16 @@ public void reduce(Text key, Iterable values, dump.append("\nRecno:: ").append(recNo++).append("\n"); dump.append("URL:: " + key.toString() + "\n"); Content content = null; - Charset charset = null; + // fall-back encoding for content of unparsed documents + Charset charset = StandardCharsets.UTF_8; for (NutchWritable val : values) { Writable value = val.get(); // unwrap if (value instanceof CrawlDatum) { dump.append("\nCrawlDatum::\n").append(((CrawlDatum) value).toString()); } else if (value instanceof Content) { if (recodeContent) { + // output recoded content later when charset is extracted from HTML + // metadata hold in ParseData content = (Content) value; } else { dump.append("\nContent::\n").append(((Content) value).toString()); From 8431dcfe52f5395a0fd9e3c00db009dbb2bcf6f5 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 21 Oct 2023 11:09:31 -0700 Subject: [PATCH 27/28] NUTCH-3013 Employ commons-lang3's StopWatch to simplify timing logic (#788) --- .github/workflows/master-build.yml | 1 - .gitignore | 1 + src/java/org/apache/nutch/crawl/CrawlDb.java | 19 ++++++++------- .../org/apache/nutch/crawl/CrawlDbMerger.java | 16 ++++++------- .../apache/nutch/crawl/DeduplicationJob.java | 16 ++++++------- .../org/apache/nutch/crawl/Generator.java | 17 ++++++------- src/java/org/apache/nutch/crawl/Injector.java | 16 
++++++------- src/java/org/apache/nutch/crawl/LinkDb.java | 15 ++++++------ .../org/apache/nutch/crawl/LinkDbMerger.java | 16 ++++++------- .../org/apache/nutch/crawl/LinkDbReader.java | 24 +++++++++---------- .../org/apache/nutch/fetcher/Fetcher.java | 17 ++++++------- .../org/apache/nutch/hostdb/ReadHostDb.java | 15 ++++++------ .../org/apache/nutch/hostdb/UpdateHostDb.java | 16 ++++++------- .../org/apache/nutch/indexer/CleaningJob.java | 16 ++++++------- .../org/apache/nutch/indexer/IndexingJob.java | 16 ++++++------- .../org/apache/nutch/parse/ParseSegment.java | 21 +++++++--------- .../nutch/scoring/webgraph/LinkDumper.java | 17 ++++++------- .../nutch/scoring/webgraph/LinkRank.java | 16 ++++++------- .../nutch/scoring/webgraph/NodeDumper.java | 16 ++++++------- .../nutch/scoring/webgraph/ScoreUpdater.java | 16 ++++++------- .../nutch/scoring/webgraph/WebGraph.java | 24 +++++++++---------- .../org/apache/nutch/tools/FreeGenerator.java | 16 ++++++------- .../nutch/tools/arc/ArcSegmentCreator.java | 16 ++++++------- .../apache/nutch/tools/warc/WARCExporter.java | 15 ++++++------ .../nutch/util/CrawlCompletionStats.java | 15 ++++++------ .../nutch/util/ProtocolStatusStatistics.java | 19 +++++++-------- .../apache/nutch/util/SitemapProcessor.java | 12 ++++++---- .../nutch/util/domain/DomainStatistics.java | 16 ++++++------- .../urlfilter/api/RegexURLFilterBaseTest.java | 11 +++++---- .../regex/TestRegexURLNormalizer.java | 8 +++++-- 30 files changed, 234 insertions(+), 225 deletions(-) diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index e3ed11c869..ba1d470ece 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -22,7 +22,6 @@ on: branches: [ master ] pull_request: branches: [ master ] - jobs: build: diff --git a/.gitignore b/.gitignore index 0612a99c23..b466908527 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ naivebayes-model csvindexwriter lib/spotbugs-* ivy/dependency-check-ant/* +.gradle* diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 3819bb3a01..16394832bf 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -19,14 +19,15 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -49,7 +50,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; /** * This class takes the output of the fetcher and updates the crawldb @@ -85,10 +85,11 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException, InterruptedException, ClassNotFoundException { - Path lock = lock(getConf(), crawlDb, force); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + + Path lock = lock(getConf(), crawlDb, 
force); Job job = CrawlDb.createJob(getConf(), crawlDb); Configuration conf = job.getConfiguration(); @@ -98,7 +99,7 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, boolean url404Purging = conf.getBoolean(CRAWLDB_PURGE_404, false); - LOG.info("CrawlDb update: starting at {}", sdf.format(start)); + LOG.info("CrawlDb update: starting"); LOG.info("CrawlDb update: db: {}", crawlDb); LOG.info("CrawlDb update: segments: {}", Arrays.asList(segments)); LOG.info("CrawlDb update: additions allowed: {}", additionsAllowed); @@ -151,9 +152,9 @@ public void update(Path crawlDb, Path[] segments, urlsFiltered); } - long end = System.currentTimeMillis(); - LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlDb update: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } /* diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java index 70c65135ec..1bf7243d38 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java @@ -18,11 +18,12 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Map.Entry; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,7 +45,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool merges several CrawlDb-s into one, optionally filtering URLs @@ -129,9 +129,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { Path lock = CrawlDb.lock(getConf(), output, false); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CrawlDb merge: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CrawlDb merge: starting"); Job job = createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { @@ -155,9 +155,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) NutchJob.cleanupAfterFailure(outPath, lock, fs); throw e; } - long end = System.currentTimeMillis(); - LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlDb merge: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static Job createMergeJob(Configuration conf, Path output, diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index ae5ac37ce0..217005d415 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -21,11 +21,12 @@ import java.lang.invoke.MethodHandles; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem;
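// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical class, not part of this patch) of the
// timing pattern NUTCH-3013 applies throughout: a commons-lang3 StopWatch
// replaces the SimpleDateFormat/System.currentTimeMillis() bookkeeping and
// TimingUtil.elapsedTime() formatting used before.
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;

public class StopWatchPatternSketch {
  public static void main(String[] args) throws InterruptedException {
    StopWatch stopWatch = new StopWatch();
    stopWatch.start();
    Thread.sleep(250); // stand-in for the MapReduce job being timed
    stopWatch.stop();
    // Prints the elapsed wall-clock time, here roughly 250 ms.
    System.out.println("job finished, elapsed: "
        + stopWatch.getTime(TimeUnit.MILLISECONDS) + " ms");
  }
}
// ---------------------------------------------------------------------------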
import org.apache.hadoop.fs.Path; @@ -48,7 +49,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -298,9 +298,9 @@ public int run(String[] args) throws IOException { } } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("DeduplicationJob: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("DeduplicationJob: starting"); Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); @@ -381,9 +381,9 @@ public int run(String[] args) throws IOException { // clean up fs.delete(tempDir, true); - long end = System.currentTimeMillis(); - LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Deduplication finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index d1569e1f03..1b62314e7a 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -30,7 +30,9 @@ import java.util.Locale; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configurable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -76,7 +78,6 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.SegmentReaderUtil; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -821,10 +822,10 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, Path lock = CrawlDb.lock(getConf(), dbDir, force); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Generator: starting at " + sdf.format(start)); - LOG.info("Generator: Selecting best-scoring urls due for fetch."); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Generator: starting"); + LOG.info("Generator: selecting best-scoring urls due for fetch."); LOG.info("Generator: filtering: " + filter); LOG.info("Generator: normalizing: " + norm); if (topN != Long.MAX_VALUE) { @@ -982,9 +983,9 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, } fs.delete(tempDir, true); - long end = System.currentTimeMillis(); - LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Generator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); Path[] patharray = new Path[generatedSegments.size()]; return generatedSegments.toArray(patharray); diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 9fca719f62..9bfd1b4547 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.crawl; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import 
org.apache.hadoop.fs.FileSystem; @@ -45,17 +46,16 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; /** * Injector takes a flat text file of URLs (or a folder containing text files) @@ -372,10 +372,11 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, boolean update, boolean normalize, boolean filter, boolean filterNormalizeAll) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Injector: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + + LOG.info("Injector: starting"); LOG.info("Injector: crawlDb: {}", crawlDb); LOG.info("Injector: urlDir: {}", urlDir); LOG.info("Injector: Converting injected urls to crawl db entries."); @@ -479,9 +480,8 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, urlsPurged404); } - long end = System.currentTimeMillis(); - LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end), - TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Injector: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); } } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) { LOG.error("Injector job failed: {}", e.getMessage()); diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java b/src/java/org/apache/nutch/crawl/LinkDb.java index 2b3d2ed907..3c752ab1db 100644 --- a/src/java/org/apache/nutch/crawl/LinkDb.java +++ b/src/java/org/apache/nutch/crawl/LinkDb.java @@ -21,13 +21,14 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -54,7 +55,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; /** Maintains an inverted link map, listing incoming links for each url. 
*/ public class LinkDb extends NutchTool implements Tool { @@ -196,9 +196,9 @@ public void invert(Path linkDb, Path[] segments, boolean normalize, Path currentLinkDb = new Path(linkDb, CURRENT_NAME); Configuration conf = job.getConfiguration(); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("LinkDb: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb: starting"); LOG.info("LinkDb: linkdb: {}", linkDb); LOG.info("LinkDb: URL normalize: {}", normalize); LOG.info("LinkDb: URL filter: {}", filter); @@ -260,8 +260,9 @@ public void invert(Path linkDb, Path[] segments, boolean normalize, } LinkDb.install(job, linkDb); - long end = System.currentTimeMillis(); - LOG.info("LinkDb: finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } private static Job createJob(Configuration config, Path linkDb, diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java b/src/java/org/apache/nutch/crawl/LinkDbMerger.java index f696c599e8..d6a41ab48c 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java +++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java @@ -18,11 +18,12 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Iterator; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -41,7 +42,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool merges several LinkDb-s into one, optionally filtering URLs through @@ -112,9 +112,9 @@ public void reduce(Text key, Iterable values, Context context) public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("LinkDb merge: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb merge: starting"); Job job = createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { @@ -137,9 +137,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME)); - long end = System.currentTimeMillis(); - LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb merge: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static Job createMergeJob(Configuration config, Path linkDb, diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java b/src/java/org/apache/nutch/crawl/LinkDbReader.java index c307b985d5..fa01f20bf3 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbReader.java +++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java @@ -16,13 +16,15 @@ */ package org.apache.nutch.crawl; +import java.io.Closeable; import java.io.IOException; - import java.lang.invoke.MethodHandles; +import
java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.Iterator; -// Commons Logging imports +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,11 +48,8 @@ import org.apache.nutch.util.AbstractChecker; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; -import java.text.SimpleDateFormat; -import java.util.Iterator; -import java.io.Closeable; + /** * Read utility for the LinkDb. @@ -153,10 +152,9 @@ public void map(Text key, Inlinks value, Context context) public void processDumpJob(String linkdb, String output, String regex) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - - LOG.info("LinkDb dump: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb dump: starting"); LOG.info("LinkDb dump: db: {}", linkdb); Path outFolder = new Path(output); @@ -192,9 +190,9 @@ public void processDumpJob(String linkdb, String output, String regex) throw e; } - long end = System.currentTimeMillis(); - LOG.info("LinkDb dump: finished at {}, elapsed: {}", - sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb dump: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } @Override diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 3727dcebef..92aef6f106 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -25,9 +25,11 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; @@ -454,11 +456,10 @@ public void fetch(Path segment, int threads) throws IOException, checkConfiguration(); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("Fetcher: starting at {}", TimingUtil.logDateMillis(start)); - LOG.info("Fetcher: segment: {}", segment); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Fetcher: starting"); + LOG.info("Fetcher: segment: {}", segment); // set the actual time for the timelimit relative // to the beginning of the whole job and not of a specific task @@ -530,9 +531,9 @@ public void fetch(Path segment, int threads) throws IOException, throw e; } - long end = System.currentTimeMillis(); - LOG.info("Fetcher: finished at {}, elapsed: {}", - TimingUtil.logDateMillis(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Fetcher: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } /** diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index ffddb18898..0321a8652c 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -18,9 +18,10 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Map; +import 
java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -42,7 +43,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.SegmentReaderUtil; import org.apache.commons.jexl3.JexlBuilder; @@ -168,9 +168,9 @@ public void map(Text key, HostDatum datum, Context context) throws IOException, // } private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean dumpHostnames, String expr) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ReadHostDb: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ReadHostDb: starting"); Configuration conf = getConf(); conf.setBoolean(HOSTDB_DUMP_HOMEPAGES, dumpHomepages); @@ -211,8 +211,9 @@ private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean throw e; } - long end = System.currentTimeMillis(); - LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ReadHostDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } private void getHostDbRecord(Path hostDb, String host) throws Exception { diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java index ffa68d0963..65e45c55d8 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java @@ -17,9 +17,10 @@ package org.apache.nutch.hostdb; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -40,7 +41,6 @@ import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -73,9 +73,9 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, boolean checkFailed, boolean checkNew, boolean checkKnown, boolean force, boolean filter, boolean normalize) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("UpdateHostDb: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("UpdateHostDb: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -149,9 +149,9 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, } LockUtil.removeLockFile(fs, lock); - long end = System.currentTimeMillis(); - LOG.info("UpdateHostDb: finished at " + sdf.format(end) + - ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("UpdateHostDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String args[]) throws Exception { diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java index 
dc3ed69e4a..04b9c2efa5 100644 --- a/src/java/org/apache/nutch/indexer/CleaningJob.java +++ b/src/java/org/apache/nutch/indexer/CleaningJob.java @@ -18,7 +18,9 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.ByteWritable; @@ -36,7 +38,6 @@ import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -139,9 +140,9 @@ public void reduce(ByteWritable key, Iterable values, public void delete(String crawldb, boolean noCommit) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CleaningJob: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CleaningJob: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -173,9 +174,8 @@ public void delete(String crawldb, boolean noCommit) throw e; } - long end = System.currentTimeMillis(); - LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CleaningJob: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); } @Override diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java index ff46bc0eff..d2115230c8 100644 --- a/src/java/org/apache/nutch/indexer/IndexingJob.java +++ b/src/java/org/apache/nutch/indexer/IndexingJob.java @@ -19,7 +19,6 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -27,7 +26,9 @@ import java.util.Locale; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.segment.SegmentChecker; import org.apache.hadoop.conf.Configuration; @@ -44,7 +45,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -104,9 +104,9 @@ public void index(Path crawlDb, Path linkDb, List segments, boolean filter, boolean normalize, boolean addBinaryContent, boolean base64) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Indexer: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Indexer: starting"); final Job job = NutchJob.getInstance(getConf()); job.setJobName("Indexer"); @@ -159,9 +159,9 @@ public void index(Path crawlDb, Path linkDb, List segments, String.format(Locale.ROOT, "%6d", counter.getValue()), counter.getName()); } - long end = System.currentTimeMillis(); - LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, 
end)); + stopWatch.stop(); + LOG.info("Indexer: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } finally { tmp.getFileSystem(conf).delete(tmp, true); } diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index c4e271feec..de45c463b9 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.parse; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.nutch.crawl.CrawlDatum; @@ -25,7 +26,6 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.TimingUtil; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -50,13 +50,12 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; -import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; /* Parse content in a segment. */ public class ParseSegment extends NutchTool implements Tool { @@ -228,12 +227,10 @@ public void parse(Path segment) throws IOException, return; } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("ParseSegment: starting at {}", sdf.format(start)); - LOG.info("ParseSegment: segment: {}", segment); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ParseSegment: starting"); + LOG.info("ParseSegment: segment: {}", segment); Job job = NutchJob.getInstance(getConf()); job.setJobName("parse " + segment); @@ -263,9 +260,9 @@ public void parse(Path segment) throws IOException, throw e; } - long end = System.currentTimeMillis(); - LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ParseSegment: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java index 54cd8b8ed1..4831d73f38 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java @@ -20,10 +20,11 @@ import java.io.DataOutput; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.List; import java.util.Random; +import java.util.concurrent.TimeUnit; + import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; @@ -31,6 +32,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -57,7 +59,6 @@ import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** 
* The LinkDumper tool creates a database of node to inlink information that can @@ -327,9 +328,9 @@ public void reduce(Text key, Iterable values, public void dumpLinks(Path webGraphDb) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("NodeDumper: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDumper: starting"); Configuration conf = getConf(); FileSystem fs = webGraphDb.getFileSystem(conf); @@ -400,9 +401,9 @@ public void dumpLinks(Path webGraphDb) throws IOException, } fs.delete(tempInverted, true); - long end = System.currentTimeMillis(); - LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDumper: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java index 739fe6cec1..c226ad130b 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java @@ -21,12 +21,12 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -35,6 +35,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -65,7 +66,6 @@ import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; public class LinkRank extends Configured implements Tool { @@ -651,9 +651,9 @@ public LinkRank(Configuration conf) { public void analyze(Path webGraphDb) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Analysis: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkRank Analysis: starting"); // store the link rank under the webgraphdb temporarily, final scores get // updated into the nodedb @@ -714,9 +714,9 @@ public void analyze(Path webGraphDb) throws IOException, // remove the temporary link rank folder fs.delete(linkRank, true); - long end = System.currentTimeMillis(); - LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkRank Analysis: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java index ede9fa1c59..dfccccc19e 100644 ---
a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java @@ -18,7 +18,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -27,6 +27,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -48,7 +49,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -293,9 +293,9 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("NodeDumper: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("NodeDumper: starting"); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); Job dumper = NutchJob.getInstance(getConf()); @@ -357,9 +357,9 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, LOG.error("NodeDumper job failed:", e); throw e; } - long end = System.currentTimeMillis(); - LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("NodeDumper: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java index 130e1b2a1c..c10a6e37b0 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java +++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java @@ -18,8 +18,8 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Random; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -28,6 +28,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -51,7 +52,6 @@ import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * Updates the score from the WebGraph node database into the crawl database. 
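// ---------------------------------------------------------------------------
// Sketch of the StopWatch lifecycle assumed by these hunks (hypothetical
// snippet, not Nutch code): every tool creates a fresh watch per run, so no
// reset is needed; a reused watch must be reset before it can be restarted.
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;

public class StopWatchLifecycleSketch {
  public static void main(String[] args) {
    StopWatch stopWatch = StopWatch.createStarted(); // new StopWatch() + start()
    long runningMs = stopWatch.getTime(TimeUnit.MILLISECONDS); // readable while running
    stopWatch.stop();
    long totalMs = stopWatch.getTime(TimeUnit.MILLISECONDS); // frozen once stopped
    stopWatch.reset(); // omitting this makes the next start() throw IllegalStateException
    stopWatch.start();
    stopWatch.stop();
    System.out.println(runningMs + " <= " + totalMs);
  }
}
// ---------------------------------------------------------------------------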
@@ -156,9 +156,9 @@ public void reduce(Text key, Iterable values, public void update(Path crawlDb, Path webGraphDb) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ScoreUpdater: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ScoreUpdater: starting"); Configuration conf = getConf(); @@ -213,9 +213,9 @@ public void update(Path crawlDb, Path webGraphDb) throws IOException, LOG.info("ScoreUpdater: installing new crawldb " + crawlDb); CrawlDb.install(updater, crawlDb); - long end = System.currentTimeMillis(); - LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ScoreUpdater: finished, elapsed: {} ms ", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java index 63d0ead7da..b98329d1e0 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java +++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashMap; @@ -26,6 +25,7 @@ import java.util.Map; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -34,6 +34,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -67,7 +68,6 @@ import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -518,14 +518,12 @@ public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("WebGraphDb: starting at " + sdf.format(start)); - LOG.info("WebGraphDb: webgraphdb: " + webGraphDb); - LOG.info("WebGraphDb: URL normalize: " + normalize); - LOG.info("WebGraphDb: URL filter: " + filter); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("WebGraphDb: starting"); + LOG.info("WebGraphDb: webgraphdb: " + webGraphDb); + LOG.info("WebGraphDb: URL normalize: " + normalize); + LOG.info("WebGraphDb: URL filter: " + filter); FileSystem fs = webGraphDb.getFileSystem(getConf()); @@ -715,9 +713,9 @@ public void createWebGraph(Path webGraphDb, Path[] segments, // remove the lock file for the webgraph LockUtil.removeLockFile(fs, lock); - long end = System.currentTimeMillis(); - LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("WebGraphDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void 
main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java index 039bccaece..e9f5c87619 100644 --- a/src/java/org/apache/nutch/tools/FreeGenerator.java +++ b/src/java/org/apache/nutch/tools/FreeGenerator.java @@ -18,10 +18,11 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.Path; @@ -47,7 +48,6 @@ import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool generates fetchlists (segments to be fetched) from plain text files @@ -180,9 +180,9 @@ public int run(String[] args) throws Exception { } } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("FreeGenerator: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("FreeGenerator: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -226,9 +226,9 @@ public int run(String[] args) throws Exception { LOG.error("FAILED: " + StringUtils.stringifyException(e)); return -1; } - long end = System.currentTimeMillis(); - LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("FreeGenerator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java index 4e916dbd50..825e752cc0 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java +++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java @@ -21,7 +21,9 @@ import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -56,7 +58,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.TimingUtil; /** *

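// ---------------------------------------------------------------------------
// Aside on the SLF4J calls in these hunks: the "{}" placeholder is filled
// only when the value is passed as a separate argument. Concatenating the
// value with "+" is a common pitfall that logs the literal "{} ms" followed
// by the number. Hypothetical logger setup below, not Nutch code.
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Slf4jPlaceholderSketch {
  private static final Logger LOG =
      LoggerFactory.getLogger(Slf4jPlaceholderSketch.class);

  public static void main(String[] args) {
    StopWatch stopWatch = StopWatch.createStarted();
    stopWatch.stop();
    // Correct: logs e.g. "finished, elapsed: 0 ms"
    LOG.info("finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
    // Pitfall: logs "finished, elapsed: {} ms0" because of string concatenation
    LOG.info("finished, elapsed: {} ms" + stopWatch.getTime(TimeUnit.MILLISECONDS));
  }
}
// ---------------------------------------------------------------------------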
    @@ -368,10 +369,10 @@ public void map(Text key, BytesWritable bytes, public void createSegments(Path arcFiles, Path segmentsOutDir) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); if (LOG.isInfoEnabled()) { - LOG.info("ArcSegmentCreator: starting at " + sdf.format(start)); + LOG.info("ArcSegmentCreator: starting"); LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles); } @@ -402,10 +403,9 @@ public void createSegments(Path arcFiles, Path segmentsOutDir) throw e; } - - long end = System.currentTimeMillis(); - LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) - + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ArcSegmentCreator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String args[]) throws Exception { diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index cf000ba526..6d8a385572 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -29,8 +29,10 @@ import java.util.List; import java.util.Locale; import java.util.UUID; +import java.util.concurrent.TimeUnit; import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; @@ -58,7 +60,6 @@ import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -428,9 +429,9 @@ protected JsonObject metadataToJson(Metadata meta) { public int generateWARC(String output, List segments, boolean onlySuccessfulResponses, boolean includeParseData, boolean includeParseText) throws IOException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("WARCExporter: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("WARCExporter: starting"); final Job job = NutchJob.getInstance(getConf()); job.setJobName("warc-exporter " + output); @@ -479,9 +480,9 @@ public int generateWARC(String output, List segments, throw new RuntimeException(message); } LOG.info(job.getCounters().toString()); - long end = System.currentTimeMillis(); - LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end), - TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("WARCExporter: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("WARCExporter job failed: {}", e.getMessage()); return -1; diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java index 7210ee83af..8696d28221 100644 --- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java +++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java @@ -20,7 +20,7 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit;
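// ---------------------------------------------------------------------------
// Note on the WARCExporter imports above: commons-lang (org.apache.commons.lang)
// and commons-lang3 (org.apache.commons.lang3) are separate artifacts with
// separate packages, so both can coexist on the classpath while the migration
// proceeds class by class. Hypothetical snippet mixing the two:
import org.apache.commons.lang.StringUtils; // legacy commons-lang 2.x
import org.apache.commons.lang3.time.StopWatch; // commons-lang3

public class MixedCommonsLangSketch {
  public static void main(String[] args) {
    StopWatch stopWatch = StopWatch.createStarted();
    String name = StringUtils.defaultIfEmpty("", "unnamed"); // lang 2.x API
    stopWatch.stop();
    System.out.println(name + " took " + stopWatch.getTime() + " ms");
  }
}
// ---------------------------------------------------------------------------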
import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -30,6 +30,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -127,9 +128,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[3]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CrawlCompletionStats: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CrawlCompletionStats: starting"); int mode = 0; String jobName = "CrawlCompletionStats"; @@ -180,9 +181,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("CrawlCompletionStats: finished at {}, elapsed: {}", - sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlCompletionStats: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java index 2499da0bfb..0fe6c57d03 100644 --- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java +++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java @@ -16,10 +16,11 @@ */ package org.apache.nutch.util; -import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -37,8 +38,6 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.metadata.Nutch; /** @@ -86,9 +85,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[2]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ProtocolStatistics: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ProtocolStatistics: starting"); String jobName = "ProtocolStatistics"; @@ -130,9 +129,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ProtocolStatistics: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index 98f7df839d..66fa9b0e7a 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -22,7 +22,9 @@ import java.util.Collection; import java.util.List; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import 
org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; @@ -359,8 +361,9 @@ else if(sitemapDatum != null) { public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter, boolean normalize, int threads) throws Exception { - long start = System.currentTimeMillis(); - LOG.info("SitemapProcessor: Starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("SitemapProcessor: starting"); FileSystem fs = crawldb.getFileSystem(getConf()); Path old = new Path(crawldb, "old"); @@ -441,8 +444,9 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches); LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries); - long end = System.currentTimeMillis(); - LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("SitemapProcessor_" + crawldb.toString(), e); diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java index 638b6c94f1..f77b72bc5f 100644 --- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java @@ -20,8 +20,9 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -39,7 +40,6 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,9 +92,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[3]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("DomainStatistics: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("DomainStatistics: starting"); int mode = 0; String jobName = "DomainStatistics"; @@ -151,9 +151,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("DomainStatistics: finished, elapsed: {} ms ", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java index c77c67eb17..080b2e5870 100644 --- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java +++ b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java @@ -16,7 +16,6 @@ */ package org.apache.nutch.urlfilter.api; -// JDK imports 
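// ---------------------------------------------------------------------------
// Why StopWatch beats raw currentTimeMillis() deltas for the benches below:
// commons-lang3 StopWatch measures intervals with the monotonic
// System.nanoTime() clock, so a wall-clock adjustment mid-run cannot skew or
// negate the reported duration. Hypothetical comparison, not Nutch code.
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;

public class MonotonicTimingSketch {
  public static void main(String[] args) throws InterruptedException {
    long wallStart = System.currentTimeMillis(); // subject to clock adjustments
    StopWatch stopWatch = StopWatch.createStarted(); // nanoTime-based interval
    Thread.sleep(50); // stand-in for the benchmarked loop
    stopWatch.stop();
    System.out.println("wall-clock delta: "
        + (System.currentTimeMillis() - wallStart) + " ms");
    System.out.println("stopwatch: "
        + stopWatch.getTime(TimeUnit.MILLISECONDS) + " ms");
  }
}
// ---------------------------------------------------------------------------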
import java.lang.invoke.MethodHandles; import java.io.BufferedReader; import java.io.FileReader; @@ -24,12 +23,13 @@ import java.io.Reader; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.junit.Assert; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -// Nutch imports import org.apache.nutch.net.URLFilter; /** @@ -58,7 +58,8 @@ protected void bench(int loops, String file) { } protected void bench(int loops, Reader rules, Reader urls) { - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); try { URLFilter filter = getURLFilter(rules); FilteredURL[] expected = readURLFile(urls); @@ -68,8 +69,8 @@ protected void bench(int loops, Reader rules, Reader urls) { } catch (Exception e) { Assert.fail(e.toString()); } - LOG.info("bench time (" + loops + ") " - + (System.currentTimeMillis() - start) + "ms"); + stopWatch.stop(); + LOG.info("bench time {} loops {} ms", loops, stopWatch.getTime(TimeUnit.MILLISECONDS)); } protected void bench(int loops, String rulesFile, String urlsFile) { diff --git a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java index 1eee7183b7..4952a1da4c 100644 --- a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java +++ b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java @@ -25,11 +25,13 @@ import java.io.IOException; import java.io.InputStreamReader; import java.util.*; +import java.util.concurrent.TimeUnit; import org.junit.Assert; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.util.NutchConfiguration; @@ -104,7 +106,8 @@ private void normalizeTest(NormalizedURL[] urls, String scope) } private void bench(int loops, String scope) { - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); try { NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope); if (expected == null) @@ -115,8 +118,9 @@ private void bench(int loops, String scope) { } catch (Exception e) { Assert.fail(e.toString()); } + stopWatch.stop(); LOG.info("bench time (" + loops + ") " - + (System.currentTimeMillis() - start) + "ms"); + + (stopWatch.getTime(TimeUnit.MILLISECONDS)) + "ms"); } private static class NormalizedURL { From 792ed28914f4beb2fb8b8ce28eebe17196c92af1 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Fri, 27 Oct 2023 15:04:22 -0700 Subject: [PATCH 28/28] NUTCH-3015 Add more CI steps to GitHub master-build.yml (#790) --- .github/workflows/dependency-check.yml | 37 +++++++++++ .github/workflows/master-build.yml | 64 +++++++++++++++---- .gitignore | 1 + build.xml | 52 ++++++++++++--- .../dependency-check-suppressions.xml | 5 -- src/java/overview.html | 16 +++++ .../creativecommons/conf/crawl-urlfilter.txt | 15 +++++ .../creativecommons/conf/nutch-site.xml | 16 +++++ src/plugin/creativecommons/data/anchor.html | 16 +++++ src/plugin/creativecommons/data/rdf.html | 16 +++++ src/plugin/creativecommons/data/rel.html | 16 +++++ src/plugin/creativecommons/ivy.xml | 1 - src/plugin/exchange-jexl/README.md | 17 
+++++ src/plugin/exchange-jexl/ivy.xml | 1 - src/plugin/feed/ivy.xml | 1 - src/plugin/headings/ivy.xml | 1 - src/plugin/index-anchor/ivy.xml | 1 - src/plugin/index-basic/ivy.xml | 1 - src/plugin/index-geoip/ivy.xml | 1 - src/plugin/index-geoip/plugin.xml | 1 + src/plugin/index-jexl-filter/ivy.xml | 1 - src/plugin/index-links/README.md | 17 +++++ src/plugin/index-links/ivy.xml | 1 - src/plugin/index-metadata/ivy.xml | 1 - src/plugin/index-more/ivy.xml | 1 - src/plugin/index-replace/ivy.xml | 1 - .../sample/testIndexReplace.html | 16 +++++ src/plugin/index-static/ivy.xml | 1 - src/plugin/indexer-cloudsearch/README.md | 17 +++++ .../indexer-cloudsearch/createCSDomain.sh | 15 +++++ src/plugin/indexer-csv/README.md | 17 +++++ src/plugin/indexer-csv/ivy.xml | 1 - src/plugin/indexer-dummy/README.md | 17 +++++ src/plugin/indexer-dummy/ivy.xml | 1 - src/plugin/indexer-elastic/README.md | 17 +++++ ...wto_upgrade_es.txt => howto_upgrade_es.md} | 17 +++++ src/plugin/indexer-kafka/ivy.xml | 1 - src/plugin/indexer-opensearch-1x/README.md | 17 +++++ ...search.txt => howto_upgrade_opensearch.md} | 17 +++++ src/plugin/indexer-rabbit/README.md | 17 +++++ src/plugin/indexer-rabbit/ivy.xml | 1 - src/plugin/indexer-solr/README.md | 17 +++++ ...upgrade_solr.txt => howto_upgrade_solr.md} | 17 +++++ src/plugin/indexer-solr/ivy.xml | 25 +++++--- src/plugin/indexer-solr/plugin.xml | 26 +++++--- src/plugin/language-identifier/ivy.xml | 1 - src/plugin/lib-htmlunit/ivy.xml | 1 - src/plugin/lib-http/ivy.xml | 1 - src/plugin/lib-nekohtml/ivy.xml | 1 - src/plugin/lib-rabbitmq/ivy.xml | 1 - src/plugin/lib-regex-filter/ivy.xml | 1 - src/plugin/lib-selenium/README.md | 17 +++++ .../lib-selenium/howto_upgrade_selenium.md | 32 ++++++++++ .../lib-selenium/howto_upgrade_selenium.txt | 15 ----- src/plugin/lib-selenium/ivy.xml | 1 - src/plugin/lib-xml/ivy.xml | 1 - src/plugin/microformats-reltag/ivy.xml | 1 - src/plugin/mimetype-filter/ivy.xml | 1 - src/plugin/nutch-extensionpoints/ivy.xml | 1 - src/plugin/parse-ext/command | 15 +++++ src/plugin/parse-ext/ivy.xml | 1 - src/plugin/parse-html/ivy.xml | 1 - src/plugin/parse-js/ivy.xml | 1 - .../sample/parse_embedded_js_test.html | 16 +++++ .../parse-js/sample/parse_pure_js_test.js | 15 +++++ src/plugin/parse-metatags/ivy.xml | 1 - .../parse-metatags/sample/testMetatags.html | 16 +++++ .../sample/testMultivalueMetatags.html | 16 +++++ ...upgrade_tika.txt => howto_upgrade_tika.md} | 17 +++++ src/plugin/parse-tika/ivy.xml | 1 - src/plugin/parse-tika/sample/nutch.html | 16 +++++ src/plugin/parse-zip/ivy.xml | 1 - src/plugin/parsefilter-debug/ivy.xml | 1 - src/plugin/parsefilter-naivebayes/ivy.xml | 1 - .../data/regex-parsefilter.txt | 15 +++++ src/plugin/parsefilter-regex/ivy.xml | 1 - src/plugin/protocol-file/ivy.xml | 1 - .../protocol-file/sample/testprotocolfile.txt | 15 +++++ .../sample/testprotocolfile_(encoded).txt | 15 +++++ src/plugin/protocol-foo/ivy.xml | 1 - src/plugin/protocol-foo/plugin.xml | 1 - src/plugin/protocol-ftp/ivy.xml | 1 - src/plugin/protocol-htmlunit/ivy.xml | 1 - src/plugin/protocol-http/ivy.xml | 1 - src/plugin/protocol-httpclient/ivy.xml | 1 - .../protocol-interactiveselenium/README.md | 17 +++++ .../protocol-interactiveselenium/ivy.xml | 1 - ...ade_okhttp.txt => howto_upgrade_okhttp.md} | 17 +++++ src/plugin/protocol-okhttp/ivy.xml | 1 - src/plugin/protocol-selenium/README.md | 17 +++++ src/plugin/protocol-selenium/ivy.xml | 1 - src/plugin/publish-rabbitmq/ivy.xml | 1 - src/plugin/scoring-depth/ivy.xml | 1 - src/plugin/scoring-link/ivy.xml | 1 - 
src/plugin/scoring-metadata/ivy.xml | 1 - src/plugin/scoring-opic/ivy.xml | 1 - src/plugin/scoring-orphan/ivy.xml | 1 - src/plugin/scoring-similarity/ivy.xml | 1 - src/plugin/subcollection/ivy.xml | 1 - src/plugin/tld/ivy.xml | 1 - src/plugin/urlfilter-automaton/ivy.xml | 1 - src/plugin/urlfilter-domain/data/hosts.txt | 15 +++++ src/plugin/urlfilter-domain/ivy.xml | 1 - .../urlfilter-domaindenylist/data/hosts.txt | 15 +++++ src/plugin/urlfilter-domaindenylist/ivy.xml | 1 - src/plugin/urlfilter-fast/README.md | 16 +++++ src/plugin/urlfilter-fast/ivy.xml | 1 - src/plugin/urlfilter-ignoreexempt/README.md | 17 +++++ src/plugin/urlfilter-ignoreexempt/ivy.xml | 1 - src/plugin/urlfilter-prefix/ivy.xml | 1 - src/plugin/urlfilter-regex/ivy.xml | 1 - src/plugin/urlfilter-suffix/ivy.xml | 1 - src/plugin/urlfilter-validator/ivy.xml | 1 - src/plugin/urlmeta/ivy.xml | 1 - src/plugin/urlnormalizer-ajax/ivy.xml | 1 - src/plugin/urlnormalizer-basic/ivy.xml | 1 - src/plugin/urlnormalizer-host/data/hosts.txt | 15 +++++ src/plugin/urlnormalizer-host/ivy.xml | 1 - src/plugin/urlnormalizer-pass/ivy.xml | 1 - .../urlnormalizer-protocol/data/protocols.txt | 15 +++++ src/plugin/urlnormalizer-protocol/ivy.xml | 1 - src/plugin/urlnormalizer-querystring/ivy.xml | 1 - src/plugin/urlnormalizer-regex/ivy.xml | 1 - .../sample/regex-normalize-default.test | 15 +++++ .../sample/regex-normalize-scope1.test | 15 +++++ .../urlnormalizer-slash/data/slashes.txt | 15 +++++ src/plugin/urlnormalizer-slash/ivy.xml | 1 - src/test/crawl-tests.xml | 16 +++++ src/test/filter-all.txt | 15 +++++ src/test/log4j.properties | 15 +++++ src/test/nutch-site.xml | 16 +++++ .../fetch-test-site/dup_of_pagea.html | 16 +++++ .../fetch-test-site/exception.html | 16 +++++ src/testresources/fetch-test-site/index.html | 16 +++++ .../fetch-test-site/nested_spider_trap.html | 16 +++++ src/testresources/fetch-test-site/pagea.html | 16 +++++ src/testresources/fetch-test-site/pageb.html | 16 +++++ src/testresources/fetch-test-site/robots.txt | 14 ++++ 138 files changed, 1060 insertions(+), 136 deletions(-) create mode 100644 .github/workflows/dependency-check.yml rename src/plugin/indexer-elastic/{howto_upgrade_es.txt => howto_upgrade_es.md} (61%) rename src/plugin/indexer-opensearch-1x/{howto_upgrade_opensearch.txt => howto_upgrade_opensearch.md} (62%) rename src/plugin/indexer-solr/{howto_upgrade_solr.txt => howto_upgrade_solr.md} (60%) create mode 100644 src/plugin/lib-selenium/howto_upgrade_selenium.md delete mode 100644 src/plugin/lib-selenium/howto_upgrade_selenium.txt rename src/plugin/parse-tika/{howto_upgrade_tika.txt => howto_upgrade_tika.md} (73%) rename src/plugin/protocol-okhttp/{howto_upgrade_okhttp.txt => howto_upgrade_okhttp.md} (52%) diff --git a/.github/workflows/dependency-check.yml b/.github/workflows/dependency-check.yml new file mode 100644 index 0000000000..f07f746a0d --- /dev/null +++ b/.github/workflows/dependency-check.yml @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: dependency check + +on: + schedule: + - cron: '0 0 * * *' # every day at midnight + +jobs: + dependency-check: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Dependency check + run: ant clean dependency-check -buildfile build.xml diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index ba1d470ece..e0af58df06 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -1,4 +1,3 @@ -# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -13,28 +12,67 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -name: master pr build +name: master pull request ci on: push: - branches: [ master ] + branches: [master] pull_request: - branches: [ master ] + types: [opened, synchronize, reopened] + branches: [master] jobs: - build: - runs-on: ubuntu-latest + javadoc: strategy: matrix: - java: [ '11' ] - + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Javadoc + run: ant clean javadoc -buildfile build.xml + rat: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Run Apache Rat + run: ant clean run-rat -buildfile build.xml + - name: Cache unknown licenses + run: echo "UNKNOWN_LICENSES=$(sed -n 18p /home/runner/work/nutch/nutch/build/apache-rat-report.txt)" >> $GITHUB_ENV + - name: Versions + run: | + echo $UNKNOWN_LICENSES + - name: Fail if any unknown licenses + if: ${{ env.UNKNOWN_LICENSES != '0 Unknown Licenses' }} + run: exit 1 + test: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest, macos-latest] + runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} - name: Build with Ant - run: ant clean nightly javadoc -buildfile build.xml + distribution: 'temurin' + - name: Test + run: ant clean test -buildfile build.xml diff --git a/.gitignore b/.gitignore index b466908527..12365dd0d4 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ csvindexwriter lib/spotbugs-* ivy/dependency-check-ant/* .gradle* +ivy/apache-rat-* diff --git a/build.xml b/build.xml index b44581405a..dd9797302b 100644 --- a/build.xml +++
b/build.xml @@ -38,7 +38,7 @@ - + @@ -48,7 +48,7 @@ - + @@ -640,13 +640,15 @@ - + + reportformat="ALL" + assemblyAnalyzerEnabled="false" + failBuildOnCVSS="1"> @@ -1025,7 +1027,7 @@ - - + @@ -1047,8 +1049,40 @@ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ivy/dependency-check-ant/dependency-check-suppressions.xml b/ivy/dependency-check-ant/dependency-check-suppressions.xml index e7de8febb2..a7f4ca16df 100644 --- a/ivy/dependency-check-ant/dependency-check-suppressions.xml +++ b/ivy/dependency-check-ant/dependency-check-suppressions.xml @@ -1,8 +1,3 @@ - - only applies to tika-server < 1.18 - ^org\.(apache\.tika:tika-(core|parsers)|gagravarr:vorbis-java-tika):.*$ - CVE-2018-1335 - diff --git a/src/java/overview.html b/src/java/overview.html index 11321417ba..3de53a7d28 100644 --- a/src/java/overview.html +++ b/src/java/overview.html @@ -1,3 +1,19 @@ + Apache Nutch diff --git a/src/plugin/creativecommons/conf/crawl-urlfilter.txt b/src/plugin/creativecommons/conf/crawl-urlfilter.txt index 324617f07a..eb6786e4b4 100644 --- a/src/plugin/creativecommons/conf/crawl-urlfilter.txt +++ b/src/plugin/creativecommons/conf/crawl-urlfilter.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # Creative Commnons crawl filter # Each non-comment, non-blank line contains a regular expression diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml index e28e12a9a8..4b343b2cc9 100644 --- a/src/plugin/creativecommons/conf/nutch-site.xml +++ b/src/plugin/creativecommons/conf/nutch-site.xml @@ -1,5 +1,21 @@ + diff --git a/src/plugin/creativecommons/data/anchor.html b/src/plugin/creativecommons/data/anchor.html index 90b522759d..3267bc9ea8 100755 --- a/src/plugin/creativecommons/data/anchor.html +++ b/src/plugin/creativecommons/data/anchor.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/data/rdf.html b/src/plugin/creativecommons/data/rdf.html index fb2c34dfe5..60c27cc541 100755 --- a/src/plugin/creativecommons/data/rdf.html +++ b/src/plugin/creativecommons/data/rdf.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/data/rel.html b/src/plugin/creativecommons/data/rel.html index 413d52f869..3d11572d82 100755 --- a/src/plugin/creativecommons/data/rel.html +++ b/src/plugin/creativecommons/data/rel.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/ivy.xml b/src/plugin/creativecommons/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/creativecommons/ivy.xml +++ b/src/plugin/creativecommons/ivy.xml @@ -1,5 +1,4 @@ - + exchange-jexl plugin for Nutch ============================== diff --git a/src/plugin/exchange-jexl/ivy.xml b/src/plugin/exchange-jexl/ivy.xml index 1275664e5d..cb5a0f1862 100644 --- a/src/plugin/exchange-jexl/ivy.xml +++ b/src/plugin/exchange-jexl/ivy.xml @@ -1,5 +1,4 @@ - + indexer-links plugin for Nutch ============================== diff --git a/src/plugin/index-links/ivy.xml b/src/plugin/index-links/ivy.xml index 624dcaf4a2..3d4fc905c3 100644 --- a/src/plugin/index-links/ivy.xml +++ b/src/plugin/index-links/ivy.xml @@ -1,5 +1,4 @@ - Testing the power of the index-replace plugin diff --git a/src/plugin/index-static/ivy.xml b/src/plugin/index-static/ivy.xml index 1275664e5d..cb5a0f1862 100644 --- a/src/plugin/index-static/ivy.xml +++ b/src/plugin/index-static/ivy.xml @@ -1,5 +1,4 @@ - + AWS CloudSearch plugin for Nutch ================================ diff --git a/src/plugin/indexer-cloudsearch/createCSDomain.sh b/src/plugin/indexer-cloudsearch/createCSDomain.sh index 24fb0156c6..1cb8481fe0 100644 --- a/src/plugin/indexer-cloudsearch/createCSDomain.sh +++ b/src/plugin/indexer-cloudsearch/createCSDomain.sh @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # example of domain configuration for CloudSearch DOMAIN="$1" diff --git a/src/plugin/indexer-csv/README.md b/src/plugin/indexer-csv/README.md index 80220974a7..4d1288b198 100644 --- a/src/plugin/indexer-csv/README.md +++ b/src/plugin/indexer-csv/README.md @@ -1,3 +1,20 @@ + + indexer-csv plugin for Nutch ============================ diff --git a/src/plugin/indexer-csv/ivy.xml b/src/plugin/indexer-csv/ivy.xml index 75b5d54e55..e7bf875468 100644 --- a/src/plugin/indexer-csv/ivy.xml +++ b/src/plugin/indexer-csv/ivy.xml @@ -1,5 +1,4 @@ - + indexer-dummy plugin for Nutch ============================== diff --git a/src/plugin/indexer-dummy/ivy.xml b/src/plugin/indexer-dummy/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/indexer-dummy/ivy.xml +++ b/src/plugin/indexer-dummy/ivy.xml @@ -1,5 +1,4 @@ - + indexer-elastic plugin for Nutch ================================ diff --git a/src/plugin/indexer-elastic/howto_upgrade_es.txt b/src/plugin/indexer-elastic/howto_upgrade_es.md similarity index 61% rename from src/plugin/indexer-elastic/howto_upgrade_es.txt rename to src/plugin/indexer-elastic/howto_upgrade_es.md index a8156444c6..b57e0c02fa 100644 --- a/src/plugin/indexer-elastic/howto_upgrade_es.txt +++ b/src/plugin/indexer-elastic/howto_upgrade_es.md @@ -1,3 +1,20 @@ + + 1. Upgrade Elasticsearch dependency in src/plugin/indexer-elastic/ivy.xml 2. Upgrade the Elasticsearch specific dependencies in src/plugin/indexer-elastic/plugin.xml diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml index 7bdd94324a..9d605c50b5 100644 --- a/src/plugin/indexer-kafka/ivy.xml +++ b/src/plugin/indexer-kafka/ivy.xml @@ -1,5 +1,4 @@ - + indexer-opensearch1x plugin for Nutch ================================ diff --git a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md similarity index 62% rename from src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt rename to src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md index 0725900445..c9b723ffcf 100644 --- a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt +++ b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md @@ -1,3 +1,20 @@ + + 1. Upgrade OpenSearch dependency in src/plugin/indexer-opensearch-1x/ivy.xml 2. Upgrade the OpenSearch specific dependencies in src/plugin/indexer-opensearch-1x/plugin.xml diff --git a/src/plugin/indexer-rabbit/README.md b/src/plugin/indexer-rabbit/README.md index 6ea09a9151..8040cd6c76 100644 --- a/src/plugin/indexer-rabbit/README.md +++ b/src/plugin/indexer-rabbit/README.md @@ -1,3 +1,20 @@ + + indexer-rabbit plugin for Nutch =============================== diff --git a/src/plugin/indexer-rabbit/ivy.xml b/src/plugin/indexer-rabbit/ivy.xml index dd450cf7f0..d2daf91dad 100644 --- a/src/plugin/indexer-rabbit/ivy.xml +++ b/src/plugin/indexer-rabbit/ivy.xml @@ -1,5 +1,4 @@ - + indexer-solr plugin for Nutch ============================= diff --git a/src/plugin/indexer-solr/howto_upgrade_solr.txt b/src/plugin/indexer-solr/howto_upgrade_solr.md similarity index 60% rename from src/plugin/indexer-solr/howto_upgrade_solr.txt rename to src/plugin/indexer-solr/howto_upgrade_solr.md index b2a7eb5c89..905fb84a9e 100644 --- a/src/plugin/indexer-solr/howto_upgrade_solr.txt +++ b/src/plugin/indexer-solr/howto_upgrade_solr.md @@ -1,3 +1,20 @@ + + 1. Upgrade Solr dependency in src/plugin/indexer-solr/ivy.xml 2. 
Upgrade the Solr specific dependencies in src/plugin/indexer-solr/plugin.xml diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml index ce59942daf..ab5fd72c7a 100644 --- a/src/plugin/indexer-solr/ivy.xml +++ b/src/plugin/indexer-solr/ivy.xml @@ -1,15 +1,20 @@ + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml index f672ac9ed0..21cc7d8bdf 100644 --- a/src/plugin/indexer-solr/plugin.xml +++ b/src/plugin/indexer-solr/plugin.xml @@ -1,14 +1,20 @@ - + diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml index 68e9ed76e1..f64b97055b 100644 --- a/src/plugin/language-identifier/ivy.xml +++ b/src/plugin/language-identifier/ivy.xml @@ -1,5 +1,4 @@ - + # Updates * The use of phantomjs has been deprecated. Check [Wikipedia](https://en.wikipedia.org/wiki/PhantomJS) for more info. * The updated code for Safari webriver is under development as starting Safari 10 on OS X El Capitan and macOS Sierra, Safari comes bundled with a new driver implementation. diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.md b/src/plugin/lib-selenium/howto_upgrade_selenium.md new file mode 100644 index 0000000000..3071c74cbf --- /dev/null +++ b/src/plugin/lib-selenium/howto_upgrade_selenium.md @@ -0,0 +1,32 @@ + + +1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml + +2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml + + To get a list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib | sed 's/^/ \n \n <\/library>/g' + + Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). + + N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows + + $ brew install gnu-sed --with-default-names + + You can then restart your terminal and the Regex + Sed command should work just fine! diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.txt b/src/plugin/lib-selenium/howto_upgrade_selenium.txt deleted file mode 100644 index 1892a6275e..0000000000 --- a/src/plugin/lib-selenium/howto_upgrade_selenium.txt +++ /dev/null @@ -1,15 +0,0 @@ -1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml - -2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml - - To get a list of dependencies and their versions execute: - $ ant -f ./build-ivy.xml - $ ls lib | sed 's/^/ \n \n <\/library>/g' - - Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). - - N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows - - $ brew install gnu-sed --with-default-names - - You can then restart your terminal and the Regex + Sed command should work just fine!
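
As a worked example of the upgrade steps above, here is a minimal sketch of the lib-selenium regeneration round-trip. It assumes GNU sed; the exact stanza shape (one library element with a wildcard export per jar in lib/) is inferred from the howto's note that all dependent libraries are exported for a "library" plugin, not taken verbatim from the patch:

    $ cd src/plugin/lib-selenium
    # resolve the upgraded ivy.xml dependencies into lib/
    $ ant -f ./build-ivy.xml
    # print one <library name="..."><export name="*"/></library> stanza per
    # fetched jar, ready to paste into the runtime section of plugin.xml
    # (GNU sed assumed, for \n in the replacement text)
    $ ls lib | sed 's/^/<library name="/; s/$/">\n  <export name="*"\/>\n<\/library>/'
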
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml index 7d3a2d6242..0d460cdb4d 100644 --- a/src/plugin/lib-selenium/ivy.xml +++ b/src/plugin/lib-selenium/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/plugin/parse-js/sample/parse_pure_js_test.js b/src/plugin/parse-js/sample/parse_pure_js_test.js index f196313f85..0e486a8793 100644 --- a/src/plugin/parse-js/sample/parse_pure_js_test.js +++ b/src/plugin/parse-js/sample/parse_pure_js_test.js @@ -1,3 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // test data for link extraction from "pure" JavaScript function selectProvider(form) { diff --git a/src/plugin/parse-metatags/ivy.xml b/src/plugin/parse-metatags/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/parse-metatags/ivy.xml +++ b/src/plugin/parse-metatags/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html index ca8b737c2b..36d2c8814a 100644 --- a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html +++ b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.md similarity index 73% rename from src/plugin/parse-tika/howto_upgrade_tika.txt rename to src/plugin/parse-tika/howto_upgrade_tika.md index 46d075948b..8ed6c3f3cd 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.md @@ -1,3 +1,20 @@ + + We are currently using a shim (https://github.com/tballison/hadoop-safe-tika because of binary conflicts in commons-io versions between what Hadoop supports and the more modern features that Apache Tika and Apache POI were using in commons-io. diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index 1586d9661f..b89e812e18 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/plugin/parse-zip/ivy.xml b/src/plugin/parse-zip/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/parse-zip/ivy.xml +++ b/src/plugin/parse-zip/ivy.xml @@ -1,5 +1,4 @@ - + Nutch Interactive Selenium ========================== diff --git a/src/plugin/protocol-interactiveselenium/ivy.xml b/src/plugin/protocol-interactiveselenium/ivy.xml index 506be0aecb..112483bcdc 100644 --- a/src/plugin/protocol-interactiveselenium/ivy.xml +++ b/src/plugin/protocol-interactiveselenium/ivy.xml @@ -1,5 +1,4 @@ - + 1. Upgrade OkHttp dependency in src/plugin/protocol-okhttp/ivy.xml 2. 
Upgrade OkHttp's own dependencies in src/plugin/protocol-okhttp/plugin.xml diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml index ead8232474..73b4fa6369 100644 --- a/src/plugin/protocol-okhttp/ivy.xml +++ b/src/plugin/protocol-okhttp/ivy.xml @@ -1,5 +1,4 @@ - + Nutch Selenium ============== diff --git a/src/plugin/protocol-selenium/ivy.xml b/src/plugin/protocol-selenium/ivy.xml index 506be0aecb..112483bcdc 100644 --- a/src/plugin/protocol-selenium/ivy.xml +++ b/src/plugin/protocol-selenium/ivy.xml @@ -1,5 +1,4 @@ - Filters URLs based on a file of regular expressions using host/domains matching first. The default policy is to accept a URL if no matches diff --git a/src/plugin/urlfilter-fast/ivy.xml b/src/plugin/urlfilter-fast/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/urlfilter-fast/ivy.xml +++ b/src/plugin/urlfilter-fast/ivy.xml @@ -1,5 +1,4 @@ - + urlfilter-ignoreexempt ====================== This plugin allows certain urls to be exempted when the external links are configured to be ignored. diff --git a/src/plugin/urlfilter-ignoreexempt/ivy.xml b/src/plugin/urlfilter-ignoreexempt/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/urlfilter-ignoreexempt/ivy.xml +++ b/src/plugin/urlfilter-ignoreexempt/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/test/filter-all.txt b/src/test/filter-all.txt index 4ed567ab1c..d738aec76a 100644 --- a/src/test/filter-all.txt +++ b/src/test/filter-all.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Config file for urlfilter-suffix plugin # Filter away all urls diff --git a/src/test/log4j.properties b/src/test/log4j.properties index 3ff115f46f..08e272c712 100644 --- a/src/test/log4j.properties +++ b/src/test/log4j.properties @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # log4j configuration used during build and unit tests log4j.rootLogger=info,stdout diff --git a/src/test/nutch-site.xml b/src/test/nutch-site.xml index dd408739dc..0d6177e5e6 100644 --- a/src/test/nutch-site.xml +++ b/src/test/nutch-site.xml @@ -1,4 +1,20 @@ + diff --git a/src/testresources/fetch-test-site/dup_of_pagea.html b/src/testresources/fetch-test-site/dup_of_pagea.html index 6444c41225..63c4e61537 100644 --- a/src/testresources/fetch-test-site/dup_of_pagea.html +++ b/src/testresources/fetch-test-site/dup_of_pagea.html @@ -1,3 +1,19 @@ + page a diff --git a/src/testresources/fetch-test-site/exception.html b/src/testresources/fetch-test-site/exception.html index e1192a176b..66f134ee25 100644 --- a/src/testresources/fetch-test-site/exception.html +++ b/src/testresources/fetch-test-site/exception.html @@ -1,3 +1,19 @@ + diff --git a/src/testresources/fetch-test-site/index.html b/src/testresources/fetch-test-site/index.html index d73ff3f691..3fc6e61e5a 100644 --- a/src/testresources/fetch-test-site/index.html +++ b/src/testresources/fetch-test-site/index.html @@ -1,3 +1,19 @@ + front page diff --git a/src/testresources/fetch-test-site/nested_spider_trap.html b/src/testresources/fetch-test-site/nested_spider_trap.html index 5dcf7c2209..dd32ee2362 100644 --- a/src/testresources/fetch-test-site/nested_spider_trap.html +++ b/src/testresources/fetch-test-site/nested_spider_trap.html @@ -1,3 +1,19 @@ + nested spider trap diff --git a/src/testresources/fetch-test-site/pagea.html b/src/testresources/fetch-test-site/pagea.html index 6444c41225..63c4e61537 100644 --- a/src/testresources/fetch-test-site/pagea.html +++ b/src/testresources/fetch-test-site/pagea.html @@ -1,3 +1,19 @@ + page a diff --git a/src/testresources/fetch-test-site/pageb.html b/src/testresources/fetch-test-site/pageb.html index 66e3725ef0..cf77ff4f75 100644 --- a/src/testresources/fetch-test-site/pageb.html +++ b/src/testresources/fetch-test-site/pageb.html @@ -1,3 +1,19 @@ + bage b diff --git a/src/testresources/fetch-test-site/robots.txt b/src/testresources/fetch-test-site/robots.txt index e69de29bb2..fc590f9733 100644 --- a/src/testresources/fetch-test-site/robots.txt +++ b/src/testresources/fetch-test-site/robots.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file
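
The CI jobs added in this final patch call plain Ant targets, so the same checks can be reproduced locally before pushing. A minimal sketch, assuming JDK 11 and Ant on the PATH, using the invocations taken verbatim from the workflows above:

    $ ant clean test -buildfile build.xml                # unit tests (test job, Linux and macOS)
    $ ant clean javadoc -buildfile build.xml             # javadoc job
    $ ant clean run-rat -buildfile build.xml             # Apache Rat license audit (rat job)
    $ ant clean dependency-check -buildfile build.xml    # nightly OWASP dependency check

The rat job then expects line 18 of build/apache-rat-report.txt to read "0 Unknown Licenses"; anything else fails the build, which is why this patch adds Apache license headers to the configuration files and test fixtures above.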