Skip to content

Commit

Permalink
Merge branch 'main' of github.com:BaseXdb/basex
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristianGruen committed Feb 24, 2025
2 parents d791bda + 7fadca9 commit b388cb8
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 33 deletions.
43 changes: 16 additions & 27 deletions basex-core/src/main/java/org/basex/build/html/HtmlParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
* is passed on.
*
* TagSoup was written by John Cowan and is based on the Apache 2.0 License:
* {@code http://home.ccil.org/~cowan/XML/tagsoup/}.
* {@code http://vrici.lojban.org/~cowan/tagsoup/}.
*
* The Validator.nu HTML parser was written by Henri Sivonen and is based on the MIT License:
* {@code https://about.validator.nu/htmlparser/}.
Expand All @@ -49,7 +49,7 @@ public HtmlParser(final IO source, final MainOptions options, final HtmlOptions
/**
* Constructor.
* @param source document source
* @param parser parser to be used
* @param parser parser to be used (can be {@code null})
* @param options main options
* @param hopts html options
* @throws IOException I/O exception
Expand All @@ -62,15 +62,15 @@ public HtmlParser(final IO source, final Parser parser, final MainOptions option
/**
* Converts an HTML document to XML.
* @param io io reference
* @param parser parser to be used
* @param parser parser to be used (can be {@code null})
* @param hopts html options
* @return parser
* @throws IOException I/O exception
*/
private static IO toXml(final IO io, final Parser parser, final HtmlOptions hopts)
throws IOException {
// reader could not be initialized; fall back to XML
if(!parser.available(hopts)) return io;
// parser unavailable: fall back to XML
if(parser == null) return io;
try {
// define output
final StringWriter sw = new StringWriter();
Expand All @@ -81,7 +81,7 @@ private static IO toXml(final IO io, final Parser parser, final HtmlOptions hopt
final String enc = io.encoding() != null
? io.encoding()
: hopts.contains(ENCODING)
? hopts.get(HtmlOptions.ENCODING)
? hopts.get(ENCODING)
: null;
if(enc != null) {
if(!Strings.supported(enc)) {
Expand Down Expand Up @@ -126,11 +126,6 @@ public enum Parser {
/** TagSoup features URL. */
private static final String FEATURES = "http://www.ccil.org/~cowan/tagsoup/features/";

@Override
public boolean fallbackToXml() {
return true;
}

@Override
XMLReader reader(final HtmlOptions hopts, final StringWriter sw) throws SAXException {
final XMLReader reader = new org.ccil.cowan.tagsoup.Parser();
Expand Down Expand Up @@ -183,11 +178,6 @@ XMLReader reader(final HtmlOptions hopts, final StringWriter sw) throws SAXExcep
private static final String CHARDET_CLASS_NAME =
"org.mozilla.intl.chardet.nsICharsetDetectionObserver";

@Override
public boolean fallbackToXml() {
return false;
}

@Override
XMLReader reader(final HtmlOptions hopts, final StringWriter sw) throws SAXException {
final nu.validator.htmlparser.sax.HtmlParser reader =
Expand Down Expand Up @@ -266,20 +256,18 @@ public boolean available(final HtmlOptions options) {
}
};

/** Default parser. */
public static final Parser DEFAULT = TAGSOUP;
/** Default parser: TAGSOUP if available, else NU if available, else {@code null}. */
public static final Parser DEFAULT;
static {
final HtmlOptions opts = new HtmlOptions();
DEFAULT = TAGSOUP.available(opts) ? TAGSOUP : NU.available(opts) ? NU : null;
}

/** String representation. */
private final String string;
/** Required classes. */
private final String[] classes;

/**
* Whether to fall back to XML if this parser is not available.
* @return result of check
*/
public abstract boolean fallbackToXml();

/**
* Return a reader instance for this parser.
* @param options HTML options
Expand Down Expand Up @@ -343,10 +331,11 @@ public static Parser of(final HtmlOptions options) {
}

/**
* Returns the parser associated with the specified HTML options.
* Returns the parser associated with the specified HTML options, if any, or the specified
* default parser.
* @param options HTML options
* @param defaultParser default parser
* @return parser
* @param defaultParser default parser (can be {@code null})
* @return parser (can be {@code null})
*/
public static Parser of(final HtmlOptions options, final Parser defaultParser) {
return options.contains(METHOD) ? options.get(METHOD).parser : defaultParser;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ protected final Expr opt(final CompileContext cc) {
/**
* Parses the input and creates an XML document.
* @param io input data
* @param defaultParser default HTML parser to be used in absence of the METHOD option
* @param defaultParser default HTML parser to be used in absence of the METHOD option (can be
* {@code null})
* @param qc query context
* @return node
* @throws QueryException query exception
Expand All @@ -60,7 +61,7 @@ protected final Item parse(final IO io, final Parser defaultParser, final QueryC
if(io == null) return Empty.VALUE;
final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), qc);
final Parser parser = Parser.of(options, defaultParser);
if(!parser.fallbackToXml()) parser.ensureAvailable(options, definition.local(), info);
if(parser != null) parser.ensureAvailable(options, definition.local(), info);
try {
return new DBNode(
new org.basex.build.html.HtmlParser(io, parser, new MainOptions(), options));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.basex.query.func.html;

import org.basex.build.html.*;
import org.basex.build.html.HtmlParser.*;
import org.basex.query.*;
import org.basex.query.func.*;
Expand All @@ -16,8 +15,7 @@
public final class HtmlParser extends StandardFunc {
@Override
public Item item(final QueryContext qc, final InputInfo ii) {
final HtmlOptions options = new HtmlOptions();
final Parser parser = Parser.of(options);
return Str.get(parser.available(options) ? parser.toString() : "");
final Parser parser = Parser.DEFAULT;
return Str.get(parser != null ? parser.toString() : "");
}
}

0 comments on commit b388cb8

Please sign in to comment.