From 2bafd9f2707b9cbc0d79df6c5e35747ae3f04a55 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Tue, 28 Feb 2023 12:52:33 -0500 Subject: [PATCH 1/4] test: add encoding coverage for all serialization methods See #2774 and #2798 --- test/test_serialization_encoding.rb | 98 +++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 test/test_serialization_encoding.rb diff --git a/test/test_serialization_encoding.rb b/test/test_serialization_encoding.rb new file mode 100644 index 0000000000..2a1af503fe --- /dev/null +++ b/test/test_serialization_encoding.rb @@ -0,0 +1,98 @@ +# coding: utf-8 +# frozen_string_literal: true + +require "helper" + +class TestSerializationEncoding < Nokogiri::TestCase + def round_trip_through_file + Tempfile.create do |io| + yield io + io.rewind + io.read + end + end + + describe "serialization encoding" do + matrix = [ + { + klass: Nokogiri::XML::Document, + documents: [ + { encoding: Encoding::UTF_8, path: ADDRESS_XML_FILE }, + { encoding: Encoding::Shift_JIS, path: SHIFT_JIS_XML }, + ], + }, + { + klass: Nokogiri::HTML4::Document, + documents: [ + { encoding: Encoding::UTF_8, path: HTML_FILE }, + { encoding: Encoding::Shift_JIS, path: SHIFT_JIS_HTML }, + ], + }, + ] + if Nokogiri.uses_gumbo? + matrix << { + klass: Nokogiri::HTML5::Document, + documents: [ + { encoding: Encoding::UTF_8, path: HTML_FILE }, + { encoding: Encoding::Shift_JIS, path: SHIFT_JIS_HTML }, + ], + } + end + + matrix.each do |matrix_entry| + describe matrix_entry[:klass] do + let(:klass) { matrix_entry[:klass] } + matrix_entry[:documents].each do |document| + describe document[:encoding] do + it "serializes with the expected encoding" do + doc = klass.parse( + File.read( + document[:path], + encoding: document[:encoding], + ), + ) + + expected_default_encoding = + if defined?(Nokogiri::HTML5::Document) && klass == Nokogiri::HTML5::Document + Encoding::UTF_8 # FIXME: see #2801, this should be document[:encoding] + else + document[:encoding] + end + + assert_equal(expected_default_encoding, doc.to_s.encoding) + + assert_equal(expected_default_encoding, doc.to_xml.encoding) + assert_equal(Encoding::UTF_8, doc.to_xml(encoding: "UTF-8").encoding) + assert_equal(Encoding::Shift_JIS, doc.to_xml(encoding: "SHIFT_JIS").encoding) + + assert_equal(expected_default_encoding, doc.to_xhtml.encoding) + assert_equal(Encoding::UTF_8, doc.to_xhtml(encoding: "UTF-8").encoding) + assert_equal(Encoding::Shift_JIS, doc.to_xhtml(encoding: "SHIFT_JIS").encoding) + + assert_equal(expected_default_encoding, doc.to_html.encoding) + assert_equal(Encoding::UTF_8, doc.to_html(encoding: "UTF-8").encoding) + assert_equal(Encoding::Shift_JIS, doc.to_html(encoding: "SHIFT_JIS").encoding) + + assert_equal(expected_default_encoding, doc.serialize.encoding) + assert_equal(Encoding::UTF_8, doc.serialize(encoding: "UTF-8").encoding) + assert_equal(Encoding::Shift_JIS, doc.serialize(encoding: "SHIFT_JIS").encoding) + + assert_equal( + doc.serialize.bytes, + round_trip_through_file { |io| doc.write_to(io) }.bytes, + ) + assert_equal( + doc.serialize(encoding: "UTF-8").bytes, + round_trip_through_file { |io| doc.write_to(io, encoding: "UTF-8") }.bytes, + ) + assert_equal( + doc.serialize(encoding: "SHIFT_JIS").bytes, + round_trip_through_file { |io| doc.write_to(io, encoding: "SHIFT_JIS") }.bytes, + ) + end + end + end + end + end + end +end From 9dc8c28495599585a6595dbdf07bfd7790ef8b80 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Tue, 28 Feb 2023 22:01:13 -0500 Subject: [PATCH 2/4] fix(jruby): write_to enc falls back to document encoding This is what libxml2's `xmlsave.c` functions do, so let's make it explicit in the caller so that JRuby has the same behavior. --- CHANGELOG.md | 1 + lib/nokogiri/xml/node.rb | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d315c2ae1..214a42e5ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Fixed * [JRuby] Serializing an HTML4 document with `#write_to` and specifying no save options will properly emit an HTML document anyway, like libxml2 does. Previously JRuby emitted XML in this situation. +* [JRuby] Serializing with `#write_to` will fall back to the document encoding when no encoding is specified, like libxml2 does. Previously JRuby emitted UTF-8 in this situation. ### Improved diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index e2ca937297..d591ce7720 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -1347,7 +1347,7 @@ def to_xhtml(options = {}) # def write_to(io, *options) options = options.first.is_a?(Hash) ? options.shift : {} - encoding = options[:encoding] || options[0] + encoding = options[:encoding] || options[0] || document.encoding if Nokogiri.jruby? save_options = options[:save_with] || options[1] indent_times = options[:indent] || 0 From 08bc6f905a5aef67208c7613d04a765fe26fa786 Mon Sep 17 00:00:00 2001 From: ellaklara Date: Mon, 27 Feb 2023 11:06:06 +0100 Subject: [PATCH 3/4] fix: support Encoding class for write_to Co-authored-by: Mike Dalessio --- lib/nokogiri/html5/node.rb | 5 +++++ lib/nokogiri/xml/node.rb | 2 ++ test/test_serialization_encoding.rb | 16 ++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/lib/nokogiri/html5/node.rb b/lib/nokogiri/html5/node.rb index 5edbb36657..b2773868b5 100644 --- a/lib/nokogiri/html5/node.rb +++ b/lib/nokogiri/html5/node.rb @@ -17,6 +17,9 @@ # limitations under the License. # +# +# TODO: this whole file should go away. maybe make it a decorator? +# require_relative "../xml/node" module Nokogiri @@ -50,6 +53,8 @@ def write_to(io, *options) config = XML::Node::SaveOptions.new(save_options.to_i) yield config if block_given? + encoding = encoding.is_a?(Encoding) ? encoding.name : encoding + config_options = config.options if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0 # Use Nokogiri's serializing code. diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index d591ce7720..3dc97b4d8b 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -1365,6 +1365,8 @@ def write_to(io, *options) config = SaveOptions.new(save_options.to_i) yield config if block_given? + encoding = encoding.is_a?(Encoding) ? encoding.name : encoding + native_write_to(io, encoding, indentation, config.options) end diff --git a/test/test_serialization_encoding.rb b/test/test_serialization_encoding.rb index 2a1af503fe..29ed10e013 100644 --- a/test/test_serialization_encoding.rb +++ b/test/test_serialization_encoding.rb @@ -64,18 +64,26 @@ def round_trip_through_file assert_equal(expected_default_encoding, doc.to_xml.encoding) assert_equal(Encoding::UTF_8, doc.to_xml(encoding: "UTF-8").encoding) assert_equal(Encoding::Shift_JIS, doc.to_xml(encoding: "SHIFT_JIS").encoding) + assert_equal(Encoding::UTF_8, doc.to_xml(encoding: Encoding::UTF_8).encoding) + assert_equal(Encoding::Shift_JIS, doc.to_xml(encoding: Encoding::Shift_JIS).encoding) assert_equal(expected_default_encoding, doc.to_xhtml.encoding) assert_equal(Encoding::UTF_8, doc.to_xhtml(encoding: "UTF-8").encoding) assert_equal(Encoding::Shift_JIS, doc.to_xhtml(encoding: "SHIFT_JIS").encoding) + assert_equal(Encoding::UTF_8, doc.to_xhtml(encoding: Encoding::UTF_8).encoding) + assert_equal(Encoding::Shift_JIS, doc.to_xhtml(encoding: Encoding::Shift_JIS).encoding) assert_equal(expected_default_encoding, doc.to_html.encoding) assert_equal(Encoding::UTF_8, doc.to_html(encoding: "UTF-8").encoding) assert_equal(Encoding::Shift_JIS, doc.to_html(encoding: "SHIFT_JIS").encoding) + assert_equal(Encoding::UTF_8, doc.to_html(encoding: Encoding::UTF_8).encoding) + assert_equal(Encoding::Shift_JIS, doc.to_html(encoding: Encoding::Shift_JIS).encoding) assert_equal(expected_default_encoding, doc.serialize.encoding) assert_equal(Encoding::UTF_8, doc.serialize(encoding: "UTF-8").encoding) assert_equal(Encoding::Shift_JIS, doc.serialize(encoding: "SHIFT_JIS").encoding) + assert_equal(Encoding::UTF_8, doc.serialize(encoding: Encoding::UTF_8).encoding) + assert_equal(Encoding::Shift_JIS, doc.serialize(encoding: Encoding::Shift_JIS).encoding) assert_equal( doc.serialize.bytes, @@ -89,6 +97,14 @@ def round_trip_through_file doc.serialize(encoding: "SHIFT_JIS").bytes, round_trip_through_file { |io| doc.write_to(io, encoding: "SHIFT_JIS") }.bytes, ) + assert_equal( + doc.serialize(encoding: "UTF-8").bytes, + round_trip_through_file { |io| doc.write_to(io, encoding: Encoding::UTF_8) }.bytes, + ) + assert_equal( + doc.serialize(encoding: "Shift_JIS").bytes, + round_trip_through_file { |io| doc.write_to(io, encoding: Encoding::Shift_JIS) }.bytes, + ) end end end From 81faf609294e428c5fe9269ea55da26dba9a1ee2 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 6 Mar 2023 17:05:40 -0500 Subject: [PATCH 4/4] doc: update CHANGELOG and Node#write_to docstring --- CHANGELOG.md | 3 +++ lib/nokogiri/xml/node.rb | 31 +++++++++++++++++++------------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 214a42e5ae..ba9a39dbfb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Added +* Serialization methods like `#to_xml`, `#to_html`, `#serialize`, and `#write_to` now accept `Encoding` objects specifying the output encoding. Previously only encoding names (strings) were accepted. [[#2774](https://github.com/sparklemotion/nokogiri/issues/2774), [#2798](https://github.com/sparklemotion/nokogiri/issues/2798)] (Thanks, [@ellaklara](https://github.com/ellaklara)!) + + ### Changed ### Fixed diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index 3dc97b4d8b..d106651e6d 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -1269,11 +1269,11 @@ def <=>(other) # # These two statements are equivalent: # - # node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML) + # node.serialize(encoding: 'UTF-8', save_with: FORMAT | AS_XML) # # or # - # node.serialize(:encoding => 'UTF-8') do |config| + # node.serialize(encoding: 'UTF-8') do |config| # config.format.as_xml # end # @@ -1310,7 +1310,7 @@ def to_html(options = {}) ### # Serialize this Node to XML using +options+ # - # doc.to_xml(:indent => 5, :encoding => 'UTF-8') + # doc.to_xml(indent: 5, encoding: 'UTF-8') # # See Node#write_to for a list of +options+ def to_xml(options = {}) @@ -1321,7 +1321,7 @@ def to_xml(options = {}) ### # Serialize this Node to XHTML using +options+ # - # doc.to_xhtml(:indent => 5, :encoding => 'UTF-8') + # doc.to_xhtml(indent: 5, encoding: 'UTF-8') # # See Node#write_to for a list of +options+ def to_xhtml(options = {}) @@ -1329,21 +1329,28 @@ def to_xhtml(options = {}) end ### - # Write Node to +io+ with +options+. +options+ modify the output of - # this method. Valid options are: + # :call-seq: + # write_to(io, *options) + # + # Serialize this node or document to +io+. + # + # [Parameters] + # - +io+ (IO) An IO-like object to which the serialized content will be written. + # - +options+ (Hash) See below # - # * +:encoding+ for changing the encoding - # * +:indent_text+ the indentation text, defaults to one space - # * +:indent+ the number of +:indent_text+ to use, defaults to 2 - # * +:save_with+ a combination of SaveOptions constants. + # [Options] + # * +:encoding+ (String or Encoding) specify the encoding of the output (defaults to document encoding) + # * +:indent_text+ (String) the indentation text (defaults to " ") + # * +:indent+ (Integer) the number of +:indent_text+ to use (defaults to +2+) + # * +:save_with+ (Integer) a combination of SaveOptions constants # # To save with UTF-8 indented twice: # - # node.write_to(io, :encoding => 'UTF-8', :indent => 2) + # node.write_to(io, encoding: 'UTF-8', indent: 2) # # To save indented with two dashes: # - # node.write_to(io, :indent_text => '-', :indent => 2) + # node.write_to(io, indent_text: '-', indent: 2) # def write_to(io, *options) options = options.first.is_a?(Hash) ? options.shift : {}