Skip to content

Commit

Permalink
Merge pull request #2798 from ellaklara/support-encoding-class-for-write_to
Browse files Browse the repository at this point in the history

fix: support Encoding class for xml write_to
  • Loading branch information
flavorjones authored Mar 7, 2023
2 parents ac832f7 + 81faf60 commit 08c2ad8
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 13 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,15 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

### Added

* Serialization methods like `#to_xml`, `#to_html`, `#serialize`, and `#write_to` now accept `Encoding` objects specifying the output encoding. Previously only encoding names (strings) were accepted. [[#2774](https://github.com/sparklemotion/nokogiri/issues/2774), [#2798](https://github.com/sparklemotion/nokogiri/issues/2798)] (Thanks, [@ellaklara](https://github.com/ellaklara)!)


### Changed

### Fixed

* [JRuby] Serializing an HTML4 document with `#write_to` and specifying no save options will properly emit an HTML document anyway, like libxml2 does. Previously JRuby emitted XML in this situation.
* [JRuby] Serializing with `#write_to` will fall back to the document encoding when no encoding is specified, like libxml2 does. Previously JRuby emitted UTF-8 in this situation.


### Improved
Expand Down
5 changes: 5 additions & 0 deletions lib/nokogiri/html5/node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
# limitations under the License.
#

#
# TODO: this whole file should go away. maybe make it a decorator?
#
require_relative "../xml/node"

module Nokogiri
Expand Down Expand Up @@ -50,6 +53,8 @@ def write_to(io, *options)
config = XML::Node::SaveOptions.new(save_options.to_i)
yield config if block_given?

encoding = encoding.is_a?(Encoding) ? encoding.name : encoding

config_options = config.options
if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0
# Use Nokogiri's serializing code.
Expand Down
35 changes: 22 additions & 13 deletions lib/nokogiri/xml/node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1269,11 +1269,11 @@ def <=>(other)
#
# These two statements are equivalent:
#
# node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
# node.serialize(encoding: 'UTF-8', save_with: FORMAT | AS_XML)
#
# or
#
# node.serialize(:encoding => 'UTF-8') do |config|
# node.serialize(encoding: 'UTF-8') do |config|
# config.format.as_xml
# end
#
Expand Down Expand Up @@ -1310,7 +1310,7 @@ def to_html(options = {})
###
# Serialize this Node to XML using +options+
#
# doc.to_xml(:indent => 5, :encoding => 'UTF-8')
# doc.to_xml(indent: 5, encoding: 'UTF-8')
#
# See Node#write_to for a list of +options+
def to_xml(options = {})
Expand All @@ -1321,33 +1321,40 @@ def to_xml(options = {})
###
# Serialize this Node to XHTML using +options+
#
# doc.to_xhtml(:indent => 5, :encoding => 'UTF-8')
# doc.to_xhtml(indent: 5, encoding: 'UTF-8')
#
# See Node#write_to for a list of +options+
def to_xhtml(options = {})
  # XHTML output is ordinary serialization run with the default XHTML save
  # options; +options+ is handed to the shared formatter untouched.
  xhtml_save_options = SaveOptions::DEFAULT_XHTML
  to_format(xhtml_save_options, options)
end

###
# Write Node to +io+ with +options+. +options+ modify the output of
# this method. Valid options are:
# :call-seq:
# write_to(io, *options)
#
# Serialize this node or document to +io+.
#
# * +:encoding+ for changing the encoding
# * +:indent_text+ the indentation text, defaults to one space
# * +:indent+ the number of +:indent_text+ to use, defaults to 2
# * +:save_with+ a combination of SaveOptions constants.
# [Parameters]
# - +io+ (IO) An IO-like object to which the serialized content will be written.
# - +options+ (Hash) See below
#
# [Options]
# * +:encoding+ (String or Encoding) specify the encoding of the output (defaults to document encoding)
# * +:indent_text+ (String) the indentation text (defaults to <code>" "</code>)
# * +:indent+ (Integer) the number of +:indent_text+ to use (defaults to +2+)
# * +:save_with+ (Integer) a combination of SaveOptions constants
#
# To save with UTF-8 indented twice:
#
# node.write_to(io, :encoding => 'UTF-8', :indent => 2)
# node.write_to(io, encoding: 'UTF-8', indent: 2)
#
# To save indented with two dashes:
#
# node.write_to(io, :indent_text => '-', :indent => 2)
# node.write_to(io, indent_text: '-', indent: 2)
#
def write_to(io, *options)
options = options.first.is_a?(Hash) ? options.shift : {}
encoding = options[:encoding] || options[0]
encoding = options[:encoding] || options[0] || document.encoding
if Nokogiri.jruby?
save_options = options[:save_with] || options[1]
indent_times = options[:indent] || 0
Expand All @@ -1365,6 +1372,8 @@ def write_to(io, *options)
config = SaveOptions.new(save_options.to_i)
yield config if block_given?

encoding = encoding.is_a?(Encoding) ? encoding.name : encoding

native_write_to(io, encoding, indentation, config.options)
end

Expand Down
114 changes: 114 additions & 0 deletions test/test_serialization_encoding.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# coding: utf-8
# frozen_string_literal: true

require "helper"

class TestSerializationEncoding < Nokogiri::TestCase
  # Yields a freshly created temporary file to the block, then rewinds it and
  # returns everything that was written to it.
  def round_trip_through_file
    Tempfile.create do |tempfile|
      yield tempfile
      tempfile.rewind
      tempfile.read
    end
  end

  describe "serialization encoding" do
    # Fixture files, each paired with the encoding it is read with.
    xml_fixtures = [
      { encoding: Encoding::UTF_8, path: ADDRESS_XML_FILE },
      { encoding: Encoding::Shift_JIS, path: SHIFT_JIS_XML },
    ]
    html_fixtures = [
      { encoding: Encoding::UTF_8, path: HTML_FILE },
      { encoding: Encoding::Shift_JIS, path: SHIFT_JIS_HTML },
    ]

    # Document class => fixtures to parse with it (insertion order is the
    # order in which the specs are defined).
    fixtures_by_class = {
      Nokogiri::XML::Document => xml_fixtures,
      Nokogiri::HTML4::Document => html_fixtures,
    }
    fixtures_by_class[Nokogiri::HTML5::Document] = html_fixtures if Nokogiri.uses_gumbo?

    fixtures_by_class.each do |klass, fixtures|
      describe klass do
        fixtures.each do |fixture|
          describe fixture[:encoding] do
            it "serializes with the expected encoding" do
              source = File.read(fixture[:path], encoding: fixture[:encoding])
              doc = klass.parse(source)

              html5 = defined?(Nokogiri::HTML5::Document) && klass == Nokogiri::HTML5::Document
              # FIXME: see #2801, for HTML5 this should be fixture[:encoding]
              expected_default_encoding = html5 ? Encoding::UTF_8 : fixture[:encoding]

              assert_equal(expected_default_encoding, doc.to_s.encoding)

              # [expected encoding of the result, value given as :encoding]
              explicit_encodings = [
                [Encoding::UTF_8, "UTF-8"],
                [Encoding::Shift_JIS, "SHIFT_JIS"],
                [Encoding::UTF_8, Encoding::UTF_8],
                [Encoding::Shift_JIS, Encoding::Shift_JIS],
              ]

              # Every string-returning serializer is checked once with no
              # :encoding option, then with each name / Encoding object.
              [:to_xml, :to_xhtml, :to_html, :serialize].each do |serializer|
                assert_equal(expected_default_encoding, doc.public_send(serializer).encoding)

                explicit_encodings.each do |expected, requested|
                  assert_equal(expected, doc.public_send(serializer, encoding: requested).encoding)
                end
              end

              # #write_to must emit the same bytes as #serialize.
              assert_equal(
                doc.serialize.bytes,
                round_trip_through_file { |io| doc.write_to(io) }.bytes,
              )

              # [encoding given to #serialize, encoding given to #write_to]
              [
                ["UTF-8", "UTF-8"],
                ["SHIFT_JIS", "SHIFT_JIS"],
                ["UTF-8", Encoding::UTF_8],
                ["Shift_JIS", Encoding::Shift_JIS],
              ].each do |serialize_encoding, write_encoding|
                assert_equal(
                  doc.serialize(encoding: serialize_encoding).bytes,
                  round_trip_through_file { |io| doc.write_to(io, encoding: write_encoding) }.bytes,
                )
              end
            end
          end
        end
      end
    end
  end
end

0 comments on commit 08c2ad8

Please sign in to comment.