Merge upstream branch 'master' into cc
sebastian-nagel committed Jan 8, 2025
2 parents 6b2d9ea + 3b6d2c6 · commit fcb6640
Showing 13 changed files with 115 additions and 55 deletions.
11 changes: 9 additions & 2 deletions .github/workflows/master-build.yml
@@ -82,11 +82,18 @@ jobs:
- 'src/testresources/**'
plugins:
- 'src/plugin/**'
buildconf:
- 'build.xml'
- 'ivy/ivy.xml'
# run if the build configuration or both 'core' and 'plugins' files were changed
- name: test all
if: ${{ steps.filter.outputs.buildconf == 'true' || ( steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'true' ) }}
run: ant clean test -buildfile build.xml
# run only if 'core' files were changed
- name: test core
if: steps.filter.outputs.core == 'true'
if: ${{ steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'false' && steps.filter.outputs.buildconf == 'false' }}
run: ant clean test-core -buildfile build.xml
# run only if 'plugins' files were changed
- name: test plugins
if: steps.filter.outputs.plugins == 'true'
if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }}
run: ant clean test-plugins -buildfile build.xml
3 changes: 3 additions & 0 deletions src/bin/nutch
@@ -87,6 +87,7 @@ if [ $# = 0 ]; then
echo " indexchecker check the indexing filters for a given url"
echo " filterchecker check url filters for a given url"
echo " normalizerchecker check url normalizers for a given url"
echo " robotsparser parse a robots.txt file and check whether urls are allowed or not"
echo " domainstats calculate domain statistics from crawldb"
echo " protocolstats calculate protocol status code stats from crawldb"
echo " crawlcomplete calculate crawl completion stats from crawldb"
@@ -271,6 +272,8 @@ elif [ "$COMMAND" = "filterchecker" ] ; then
CLASS=org.apache.nutch.net.URLFilterChecker
elif [ "$COMMAND" = "normalizerchecker" ] ; then
CLASS=org.apache.nutch.net.URLNormalizerChecker
elif [ "$COMMAND" = "robotsparser" ] ; then
CLASS=org.apache.nutch.protocol.RobotRulesParser
elif [ "$COMMAND" = "domainstats" ] ; then
CLASS=org.apache.nutch.util.DomainStatistics
elif [ "$COMMAND" = "protocolstats" ] ; then
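
Note: the new robotsparser command maps to org.apache.nutch.protocol.RobotRulesParser, which builds on the crawler-commons robots.txt parser. As a rough sketch of what the command checks — this is not the Nutch tool itself, and the crawler-commons parseContent() signature may differ between library versions:

// Sketch only: check a couple of URLs against an in-memory robots.txt
// using the crawler-commons classes that RobotRulesParser builds on.
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotsCheckSketch {
  public static void main(String[] args) {
    String robotsTxt = "User-agent: *\nDisallow: /private/\n";
    BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
        "https://example.org/robots.txt",            // robots.txt URL
        robotsTxt.getBytes(StandardCharsets.UTF_8),  // raw content
        "text/plain",                                // content type
        "mycrawler");                                // agent name to match
    System.out.println(rules.isAllowed("https://example.org/index.html")); // true
    System.out.println(rules.isAllowed("https://example.org/private/x"));  // false
  }
}
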
23 changes: 20 additions & 3 deletions src/java/org/apache/nutch/hostdb/ResolverThread.java
@@ -114,15 +114,32 @@ public void run() {
}
}

context.getCounter("UpdateHostDb",
Long.toString(datum.numFailures()) + "_times_failed").increment(1);
context.getCounter("UpdateHostDb", createFailureCounterLabel(datum)).increment(1);
} catch (Exception ioe) {
LOG.warn(StringUtils.stringifyException(ioe));
}
} catch (Exception e) {
LOG.warn(StringUtils.stringifyException(e));
}

context.getCounter("UpdateHostDb", "checked_hosts").increment(1);
}

private String createFailureCounterLabel(HostDatum datum) {
// Hadoop allows no more than 120 distinct counters per job. With a large
// number of distinct failure counts we would exceed that limit, Hadoop
// would complain, and the job would fail. Limit the number of possible
// labels by grouping numFailures into buckets. NUTCH-3096
String label = null;
long n = datum.numFailures();
if (n < 4) {
label = Long.toString(n);
} else if (n > 3 && n < 11) {
label = "4-10";
} else {
label = ">10";
}

return label + "_times_failed";
}
}
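
The bucketing above maps any failure count to one of at most six counter labels ("0_times_failed" through "3_times_failed", "4-10_times_failed", ">10_times_failed"), keeping the counter group well below Hadoop's default cap of 120 user counters per job (mapreduce.job.counters.max). A minimal standalone illustration of the mapping, not part of the patch:

// Standalone illustration (not part of the patch) of the failure-count
// bucketing used in createFailureCounterLabel() above.
public class FailureBucketsDemo {

  static String bucketLabel(long numFailures) {
    final String label;
    if (numFailures < 4) {
      label = Long.toString(numFailures); // "0", "1", "2" or "3"
    } else if (numFailures < 11) {
      label = "4-10";
    } else {
      label = ">10";
    }
    return label + "_times_failed";
  }

  public static void main(String[] args) {
    // Any failure count collapses into one of at most six counter names.
    for (long n : new long[] { 0, 1, 3, 4, 10, 11, 250 }) {
      System.out.println(n + " -> " + bucketLabel(n)); // e.g. 250 -> >10_times_failed
    }
  }
}
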
96 changes: 69 additions & 27 deletions src/java/org/apache/nutch/segment/SegmentReader.java
@@ -90,7 +90,7 @@ public class SegmentReader extends Configured implements Tool {

public static class InputCompatMapper extends
Mapper<WritableComparable<?>, Writable, Text, NutchWritable> {

private Text newKey = new Text();

@Override
@@ -195,6 +195,28 @@ public void reduce(Text key, Iterable<NutchWritable> values,
}
}

private static boolean segmSubdirExists(Configuration conf, Path segment,
String subDir) throws IOException {
Path segmSubPath = new Path(segment, subDir);
boolean exists = segmSubPath.getFileSystem(conf).exists(segmSubPath);
if (!exists) {
LOG.warn("Segment subdirectory {} does not exist in {}!", subDir,
segment);
}
return exists;
}

private static void addSegmSubDirIfExists(List<Path> inputDirs, Configuration conf,
Path segment, String subDir) throws IOException {
Path segmSubPath = new Path(segment, subDir);
if (segmSubPath.getFileSystem(conf).exists(segmSubPath)) {
inputDirs.add(segmSubPath);
} else {
LOG.warn("Segment subdirectory {} does not exist in {} - skipping!", subDir,
segment);
}
}

public void dump(Path segment, Path output) throws IOException,
InterruptedException, ClassNotFoundException {

@@ -203,21 +225,36 @@ public void dump(Path segment, Path output) throws IOException,
Job job = Job.getInstance(getConf(), "Nutch SegmentReader: " + segment);
Configuration conf = job.getConfiguration();

if (ge)
FileInputFormat.addInputPath(job, new Path(segment,
CrawlDatum.GENERATE_DIR_NAME));
if (fe)
FileInputFormat.addInputPath(job, new Path(segment,
CrawlDatum.FETCH_DIR_NAME));
if (pa)
FileInputFormat.addInputPath(job, new Path(segment,
CrawlDatum.PARSE_DIR_NAME));
if (co)
FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
if (pd)
FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
if (pt)
FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
List<Path> inputDirs = new ArrayList<>();
if (ge) {
addSegmSubDirIfExists(inputDirs, conf, segment,
CrawlDatum.GENERATE_DIR_NAME);
}
if (fe) {
addSegmSubDirIfExists(inputDirs, conf, segment,
CrawlDatum.FETCH_DIR_NAME);
}
if (pa) {
addSegmSubDirIfExists(inputDirs, conf, segment,
CrawlDatum.PARSE_DIR_NAME);
}
if (co) {
addSegmSubDirIfExists(inputDirs, conf, segment, Content.DIR_NAME);
}
if (pd) {
addSegmSubDirIfExists(inputDirs, conf, segment, ParseData.DIR_NAME);
}
if (pt) {
addSegmSubDirIfExists(inputDirs, conf, segment, ParseText.DIR_NAME);
}
if (inputDirs.isEmpty()) {
String msg = "No segment subdirectories defined as input";
LOG.error(msg);
throw new RuntimeException(msg);
}
for (Path p : inputDirs) {
FileInputFormat.addInputPath(job, p);
}

job.setInputFormatClass(SequenceFileInputFormat.class);
job.setMapperClass(InputCompatMapper.class);
@@ -243,7 +280,7 @@ public void dump(Path segment, Path output) throws IOException,
}
} catch (IOException | InterruptedException | ClassNotFoundException e ){
LOG.error(StringUtils.stringifyException(e));
throw e;
throw e;
}

// concatenate the output
@@ -307,7 +344,7 @@ public void get(final Path segment, final Text key, Writer writer,
final Map<String, List<Writable>> results) throws Exception {
LOG.info("SegmentReader: get '{}'", key);
ArrayList<Thread> threads = new ArrayList<>();
if (co)
if (co && segmSubdirExists(getConf(), segment, Content.DIR_NAME))
threads.add(new Thread() {
@Override
public void run() {
@@ -320,7 +357,7 @@ public void run() {
}
}
});
if (fe)
if (fe && segmSubdirExists(getConf(), segment, CrawlDatum.FETCH_DIR_NAME))
threads.add(new Thread() {
@Override
public void run() {
@@ -333,7 +370,8 @@ public void run() {
}
}
});
if (ge)
if (ge
&& segmSubdirExists(getConf(), segment, CrawlDatum.GENERATE_DIR_NAME))
threads.add(new Thread() {
@Override
public void run() {
@@ -346,7 +384,7 @@ public void run() {
}
}
});
if (pa)
if (pa && segmSubdirExists(getConf(), segment, CrawlDatum.PARSE_DIR_NAME))
threads.add(new Thread() {
@Override
public void run() {
@@ -359,7 +397,7 @@ public void run() {
}
}
});
if (pd)
if (pd && segmSubdirExists(getConf(), segment, ParseData.DIR_NAME))
threads.add(new Thread() {
@Override
public void run() {
@@ -372,7 +410,7 @@ public void run() {
}
}
});
if (pt)
if (pt && segmSubdirExists(getConf(), segment, ParseText.DIR_NAME))
threads.add(new Thread() {
@Override
public void run() {
@@ -386,6 +424,10 @@ public void run() {
}
});
Iterator<Thread> it = threads.iterator();
if (!it.hasNext()) {
LOG.error("No segment subdirectories specified as input!");
return;
}
while (it.hasNext())
it.next().start();
int cnt;
@@ -476,7 +518,7 @@ private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
* {@link Metadata#CONTENT_ENCODING} then fallback
* {@link java.nio.charset.StandardCharsets#UTF_8}
* @param parseMeta a populated {@link Metadata}
* @return {@link Charset}
* @return {@link Charset}
*/
public static Charset getCharset(Metadata parseMeta) {
Charset cs = StandardCharsets.UTF_8;
@@ -548,7 +590,7 @@ public void getStats(Path segment, final SegmentReaderStats stats)
Text key = new Text();
CrawlDatum val = new CrawlDatum();
FileSystem fs = segment.getFileSystem(getConf());

if (ge) {
SequenceFile.Reader[] readers = SegmentReaderUtil.getReaders(
new Path(segment, CrawlDatum.GENERATE_DIR_NAME), getConf());
@@ -559,7 +601,7 @@ public void getStats(Path segment, final SegmentReaderStats stats)
}
stats.generated = cnt;
}

if (fe) {
Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDirectory()) {
@@ -584,7 +626,7 @@ public void getStats(Path segment, final SegmentReaderStats stats)
stats.fetched = cnt;
}
}

if (pd) {
Path parseDir = new Path(segment, ParseData.DIR_NAME);
if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDirectory()) {
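
The reworked dump() and get() methods register only segment subdirectories that actually exist and bail out early when none of the requested ones are present, instead of handing missing paths to FileInputFormat. A minimal sketch of this check-then-add pattern against the Hadoop FileSystem API — illustration only; the helper name below is not from the patch:

// Illustration only: register as MapReduce input only those segment
// subdirectories that exist, and fail fast if none are present.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class ExistingInputPathsSketch {

  static void addExistingSubDirs(Job job, Path segment, String... subDirs)
      throws IOException {
    Configuration conf = job.getConfiguration();
    List<Path> inputs = new ArrayList<>();
    for (String name : subDirs) {
      Path p = new Path(segment, name);
      if (p.getFileSystem(conf).exists(p)) {
        inputs.add(p); // existing subdirectory: use it
      }                // missing subdirectory: skip (Nutch additionally logs a warning)
    }
    if (inputs.isEmpty()) {
      throw new RuntimeException("No segment subdirectories found in " + segment);
    }
    for (Path p : inputs) {
      FileInputFormat.addInputPath(job, p);
    }
  }
}
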
9 changes: 1 addition & 8 deletions src/plugin/indexer-elastic/ivy.xml
@@ -36,14 +36,7 @@
</publications>

<dependencies>
<dependency org="org.elasticsearch.client" name="elasticsearch-rest-high-level-client" rev="7.10.2">
<!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
<exclude org="commons-codec" name="commons-codec" />
<exclude org="commons-logging" name="commons-logging" />
<exclude org="com.tdunning" name="t-digest" />
<exclude org="org.apache.logging.log4j" name="log4j-api" />
<exclude org="org.apache.lucene" name="*"/>
</dependency>
<dependency org="org.elasticsearch.client" name="elasticsearch-rest-high-level-client" rev="7.10.2"/>
<dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="8.11.2"/>
<dependency org="org.apache.lucene" name="lucene-backward-codecs" rev="8.11.2"/>
<dependency org="org.apache.lucene" name="lucene-core" rev="8.11.2"/>
4 changes: 4 additions & 0 deletions src/plugin/indexer-elastic/plugin.xml
@@ -23,6 +23,8 @@
<!-- Elastic Rest Client Dependencies -->
<!-- end of Elastic Rest Client dependencies -->
<library name="aggs-matrix-stats-client-7.10.2.jar"/>
<library name="commons-codec-1.11.jar"/>
<library name="commons-logging-1.1.3.jar"/>
<library name="compiler-0.9.6.jar"/>
<library name="elasticsearch-7.10.2.jar"/>
<library name="elasticsearch-cli-7.10.2.jar"/>
@@ -46,6 +48,7 @@
<library name="joda-time-2.10.4.jar"/>
<library name="jopt-simple-5.0.2.jar"/>
<library name="lang-mustache-client-7.10.2.jar"/>
<library name="log4j-api-2.11.1.jar"/>
<library name="lucene-analyzers-common-8.11.2.jar"/>
<library name="lucene-backward-codecs-8.11.2.jar"/>
<library name="lucene-core-8.11.2.jar"/>
@@ -66,6 +69,7 @@
<library name="s2-geometry-library-java-1.0.0.jar"/>
<library name="snakeyaml-1.26.jar"/>
<library name="spatial4j-0.7.jar"/>
<library name="t-digest-3.2.jar"/>
</runtime>
<requires>
<import plugin="nutch-extensionpoints" />
4 changes: 2 additions & 2 deletions src/plugin/parsefilter-debug/plugin.xml
@@ -31,8 +31,8 @@
<import plugin="nutch-extensionpoints"/>
</requires>

<extension id="org.apache.nutch.htmlparsefilter.regex"
name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
<extension id="org.apache.nutch.htmlparsefilter.debug"
name="Nutch Debug Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
<implementation id="DebugParseFilter"
class="org.apache.nutch.parsefilter.debug.DebugParseFilter">
</implementation>
2 changes: 1 addition & 1 deletion src/plugin/parsefilter-naivebayes/plugin.xml
@@ -32,7 +32,7 @@
</requires>

<extension id="org.apache.nutch.htmlparsefilter.naivebayes"
name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
name="Nutch NaiveBayes Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
<implementation id="NaiveBayesHTMLParseFilter"
class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
</extension>
2 changes: 1 addition & 1 deletion src/plugin/parsefilter-regex/plugin.xml
@@ -32,7 +32,7 @@
</requires>

<extension id="org.apache.nutch.htmlparsefilter.regex"
name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
name="Nutch Regex Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
<implementation id="RegexParseFilter"
class="org.apache.nutch.parsefilter.regex.RegexParseFilter">
<parameter name="file" value="regex-parsefilter.txt"/>
4 changes: 2 additions & 2 deletions src/plugin/protocol-htmlunit/plugin.xml
@@ -33,8 +33,8 @@
<import plugin="lib-htmlunit"/>
</requires>

<extension id="org.apache.nutch.protocol.http"
name="HttpProtocol"
<extension id="org.apache.nutch.protocol.htmlunit"
name="HtmlUnitHttpProtocol"
point="org.apache.nutch.protocol.Protocol">

<implementation id="org.apache.nutch.protocol.htmlunit.Http"
8 changes: 1 addition & 7 deletions src/plugin/protocol-httpclient/plugin.xml
@@ -34,20 +34,14 @@
</requires>

<extension id="org.apache.nutch.protocol.httpclient"
name="HttpProtocol"
name="HttpClientProtocol"
point="org.apache.nutch.protocol.Protocol">

<implementation id="org.apache.nutch.protocol.httpclient.Http"
class="org.apache.nutch.protocol.httpclient.Http">
<parameter name="protocolName" value="http"/>
</implementation>

</extension>

<extension id="org.apache.nutch.protocol.https"
name="HttpsProtocol"
point="org.apache.nutch.protocol.Protocol">

<implementation id="org.apache.nutch.protocol.httpclient.Http"
class="org.apache.nutch.protocol.httpclient.Http">
<parameter name="protocolName" value="https"/>
2 changes: 1 addition & 1 deletion src/plugin/protocol-interactiveselenium/plugin.xml
@@ -34,7 +34,7 @@
</requires>

<extension id="org.apache.nutch.protocol.interactiveselenium"
name="HttpProtocol"
name="InteractiveSeleniumHttpProtocol"
point="org.apache.nutch.protocol.Protocol">

<implementation id="org.apache.nutch.protocol.interactiveselenium.Http"
2 changes: 1 addition & 1 deletion src/plugin/protocol-selenium/plugin.xml
@@ -34,7 +34,7 @@
</requires>

<extension id="org.apache.nutch.protocol.selenium"
name="HttpProtocol"
name="SeleniumHttpProtocol"
point="org.apache.nutch.protocol.Protocol">

<implementation id="org.apache.nutch.protocol.selenium.Http"
