From eca68e7e44dff5f037d80ae8386a8ecd547a01c4 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 21 Jun 2024 17:23:31 -0400 Subject: [PATCH] feat: HTML5::DocumentFragment.parse and .new take a :context kwarg - deprecate positional options hash in .parse - improve documentation for .parse --- CHANGELOG.md | 2 + lib/nokogiri/css.rb | 2 +- lib/nokogiri/html5.rb | 12 ++--- lib/nokogiri/html5/document_fragment.rb | 59 ++++++++++++++++++++----- test/html5/test_api.rb | 38 ++++++++++++++++ 5 files changed, 95 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6dff029b7..ec6cd2eccf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA * Documentation has been improved for `CSS.xpath_for`. [#3224] @flavorjones * [CRuby] When compiling packaged libraries from source, allow users' `AR` and `LD` environment variables to set the archiver and linker commands, respectively. This augments the existing `CC` environment variable to set the compiler command. [#3165] @ziggythehamster * [CRuby] The HTML5 parse methods accept a `:parse_noscript_content_as_text` keyword argument which will emulate the parsing behavior of a browser which has scripting enabled. [#3178, #3231] @stevecheckoway +* [CRuby] `HTML5::DocumentFragment.parse` and `.new` accept a `:context` keyword argument that is the parse context node or element name. Previously this could only be passed in as a positional argument to `.new` and not at all to `.parse`. @flavorjones ### Fixed @@ -49,6 +50,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA * The undocumented and unused method `Nokogiri::CSS.parse` is now deprecated and will generate a warning. The AST returned by this method is private and subject to change and removal in future versions of Nokogiri. This method will be removed in a future version of Nokogiri. 
* Passing an options hash to `CSS.xpath_for` is now deprecated and will generate a warning. Use keyword arguments instead. This will become an error in a future version of Nokogiri. +* Passing an options hash to `HTML5::DocumentFragment.parse` is now deprecated and will generate a warning. Use keyword arguments instead. This will become an error in a future version of Nokogiri. ## v1.16.6 / 2024-06-13 diff --git a/lib/nokogiri/css.rb b/lib/nokogiri/css.rb index b8104ca475..c3fe72ec9c 100644 --- a/lib/nokogiri/css.rb +++ b/lib/nokogiri/css.rb @@ -88,7 +88,7 @@ def xpath_for( cache: true ) unless options.nil? - warn("Passing options as an explicit hash is deprecated. Use keyword arguments instead. This will become an error in a future release.", uplevel: 1, category: :deprecated) + warn("Nokogiri::CSS.xpath_for: Passing options as an explicit hash is deprecated. Use keyword arguments instead. This will become an error in a future release.", uplevel: 1, category: :deprecated) end raise(TypeError, "no implicit conversion of #{selector.inspect} to String") unless selector.respond_to?(:to_str) diff --git a/lib/nokogiri/html5.rb b/lib/nokogiri/html5.rb index 1465e265fe..4e8397dbb2 100644 --- a/lib/nokogiri/html5.rb +++ b/lib/nokogiri/html5.rb @@ -249,11 +249,13 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # == Notes # - # * The Nokogiri::HTML5.fragment function takes a string and parses it as a HTML5 document. The - # +html+, +head+, and +body+ elements are removed from this document, and any children of these - # elements that remain are returned as a Nokogiri::HTML5::DocumentFragment. + # * The Nokogiri::HTML5.fragment function takes a String or IO and parses it as a HTML5 document + # in a +body+ context. 
As a result, the +html+, +head+, and +body+ elements are removed from + # this document, and any children of these elements that remain are returned as a + # Nokogiri::HTML5::DocumentFragment; but you can pass in a different context (e.g., "html" to + # get +head+ and +body+ tags in the result). # - # * The Nokogiri::HTML5.parse function takes a string and passes it to the + # * The Nokogiri::HTML5.parse function takes a String or IO and passes it to the # gumbo_parse_with_options method, using the default options. The resulting Gumbo # parse tree is then walked. # @@ -273,7 +275,7 @@ def parse(string, url = nil, encoding = nil, **options, &block) # Parse a fragment from +string+. Convenience method for # Nokogiri::HTML5::DocumentFragment.parse. def fragment(string, encoding = nil, **options) - DocumentFragment.parse(string, encoding, options) + DocumentFragment.parse(string, encoding, **options) end # :nodoc: diff --git a/lib/nokogiri/html5/document_fragment.rb b/lib/nokogiri/html5/document_fragment.rb index 795ea0f33b..935ac6491d 100644 --- a/lib/nokogiri/html5/document_fragment.rb +++ b/lib/nokogiri/html5/document_fragment.rb @@ -25,6 +25,47 @@ module HTML5 # # 💡 HTML5 functionality is not available when running JRuby. class DocumentFragment < Nokogiri::HTML4::DocumentFragment + class << self + # :call-seq: + # parse(tags, **options) + # parse(tags, encoding = nil, **options) + # + # Parse an HTML5 document fragment from +tags+, returning a Nokogiri::HTML5::DocumentFragment. + # + # [Parameters] + # - +tags+ [String, IO] The HTML5 document fragment to parse. + # - +encoding+ [String] The name of the encoding to use when parsing the document fragment. (default +nil+) + # + # Also see Nokogiri::HTML5 for a longer explanation of how encoding is handled by the parser. + # + # [Options] + # - +:context+ [String, Nokogiri::XML::Node] The context in which to parse the document fragment. (default +"body"+) + # - +:max_errors+ [Integer] The maximum number of parse errors to record.
(default +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0) + # - +:max_tree_depth+ [Integer] The maximum depth of the parse tree. (default +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) + # - +:max_attributes+ [Integer] The maximum number of attributes allowed on an element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+) + # - +:parse_noscript_content_as_text+ [Boolean] Whether to parse the content of +noscript+ elements as text. (default +false+) + # + # Also see Nokogiri::HTML5 for a longer explanation of the options. + # + # [Returns] + # - [Nokogiri::HTML5::DocumentFragment] The parsed document fragment. + # + def parse(tags, encoding = nil, positional_options_hash = nil, **options) + unless positional_options_hash.nil? + warn("Nokogiri::HTML5::DocumentFragment.parse: Passing options as an explicit hash is deprecated. Use keyword arguments instead. This will become an error in a future release.", uplevel: 1, category: :deprecated) + options.merge!(positional_options_hash) + end + + context = options.delete(:context) + + document = HTML5::Document.new + document.encoding = "UTF-8" + tags = HTML5.read_and_encode(tags, encoding) + + new(document, tags, context, options) + end + end + attr_accessor :document attr_accessor :errors @@ -36,18 +77,20 @@ class DocumentFragment < Nokogiri::HTML4::DocumentFragment attr_reader :quirks_mode # Create a document fragment.
- def initialize(doc, tags = nil, ctx = nil, options = {}) # rubocop:disable Lint/MissingSuper - self.document = doc - self.errors = [] + def initialize(doc, tags = nil, context = nil, options = {}) # rubocop:disable Lint/MissingSuper + @document = doc + @errors = [] return self unless tags tags = Nokogiri::HTML5.read_and_encode(tags, nil) + context = options.delete(:context) if options.key?(:context) + options[:max_attributes] ||= Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES options[:max_errors] ||= options.delete(:max_parse_errors) || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH - Nokogiri::Gumbo.fragment(self, tags, ctx, **options) + Nokogiri::Gumbo.fragment(self, tags, context, **options) end def serialize(options = {}, &block) # :nodoc: @@ -56,14 +99,6 @@ def serialize(options = {}, &block) # :nodoc: XML::Node.instance_method(:serialize).bind_call(self, options, &block) end - # Parse a document fragment from +tags+, returning a Nodeset. - def self.parse(tags, encoding = nil, options = {}) - doc = HTML5::Document.new - tags = HTML5.read_and_encode(tags, encoding) - doc.encoding = "UTF-8" - new(doc, tags, nil, options) - end - def extract_params(params) # :nodoc: handler = params.find do |param| ![Hash, String, Symbol].include?(param.class) diff --git a/test/html5/test_api.rb b/test/html5/test_api.rb index 93c5fb7783..acc4800946 100644 --- a/test/html5/test_api.rb +++ b/test/html5/test_api.rb @@ -398,6 +398,44 @@ def initialize(*args) end describe Nokogiri::HTML5::DocumentFragment do + describe "passing in context node" do + it "to DocumentFragment.new" do + fragment = Nokogiri::HTML5::DocumentFragment.new( + Nokogiri::HTML5::Document.new, + "
foo
", + "html", + ) + assert_match(/<head>/, fragment.to_s) + assert_match(/<body>/, fragment.to_s) + end + + describe "to DocumentFragment.parse" do + it "as an options hash" do + assert_output(nil, /Passing options as an explicit hash is deprecated/) do + fragment = Nokogiri::HTML5::DocumentFragment.parse( + "
foo
", + nil, + { context: "html" }, + ) + assert_match(/<head>/, fragment.to_s) + assert_match(/<body>/, fragment.to_s) + end + end + + it "as keyword argument" do + fragment = Nokogiri::HTML5::DocumentFragment.parse("
foo
", context: "html") + assert_match(/<head>/, fragment.to_s) + assert_match(/<body>/, fragment.to_s) + end + end + + it "to HTML5.fragment" do + fragment = Nokogiri::HTML5.fragment("
foo
", context: "html") + assert_match(/<head>/, fragment.to_s) + assert_match(/<body>/, fragment.to_s) + end + end + + describe "subclassing" do let(:klass) do Class.new(Nokogiri::HTML5::DocumentFragment) do