From 7c8d21b119db0746426119dbf6b4f3c0780f4736 Mon Sep 17 00:00:00 2001 From: "luk.ho" Date: Wed, 3 Apr 2024 13:17:29 -0500 Subject: [PATCH] Bug 36477854 - [36336179->14.1.2.0.0] Performance Degradation for UTF encoded strings After Upgrading .Net Client to 12.2.1.4, coherence-net-v14.1.1.0-core [git-p4: depot-paths = "//dev/release.net/coherence-net-v14.1.1.0-core/": change = 108233] --- src/Coherence.Core/IO/DataWriter.cs | 265 +----------------- src/Coherence.Core/IO/Pof/PofHelper.cs | 47 +--- .../IO/Pof/WritingPofHandler.cs | 11 +- .../Util/SerializationHelper.cs | 29 +- .../IO/IndentingWriterTests.cs | 27 +- .../IO/Pof/PofStreamPrimitiveArrayTests.cs | 6 +- .../IO/Pof/PofStreamReaderAndWriterTests.cs | 70 ++++- 7 files changed, 125 insertions(+), 330 deletions(-) diff --git a/src/Coherence.Core/IO/DataWriter.cs b/src/Coherence.Core/IO/DataWriter.cs index df07452..8a9ebfe 100644 --- a/src/Coherence.Core/IO/DataWriter.cs +++ b/src/Coherence.Core/IO/DataWriter.cs @@ -1,9 +1,10 @@ /* - * Copyright (c) 2000, 2020, Oracle and/or its affiliates. + * Copyright (c) 2000, 2024, Oracle and/or its affiliates. * * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ + using System; using System.IO; using System.Text; @@ -48,31 +49,6 @@ public DataWriter(Stream output) : base(output) #endregion - #region Properties - - /// - /// Obtain a temp buffer used to avoid allocations from - /// repeated calls to String APIs. - /// - /// - /// a char buffer of CHAR_BUF_SIZE characters long - /// - protected char[] CharBuf - { - get - { - // "partial" (i.e. windowed) char buffer just for formatUTF - char[] ach = m_achBuf; - if (ach == null) - { - m_achBuf = ach = new char[CHAR_BUF_SIZE]; - } - return ach; - } - } - - #endregion - #region Packed format writing /// @@ -403,247 +379,12 @@ public override void Write(string text) } else { - byte[] bytes = FormatUTF(text); + byte[] bytes = Encoding.UTF8.GetBytes(text); WritePackedInt32(bytes.Length); Write(bytes); } } #endregion - - #region UTF encoding functions - - /// - /// Figure out how many bytes it will take to hold the passed String. - /// - /// - /// This method is tightly bound to formatUTF. - /// - /// - /// the String - /// - /// - /// the binary UTF length - /// - protected int CalcUTF(String s) - { - int cch = s.Length; - int cb = cch; - char[] ach = CharBuf; - bool fSmall = (cch <= CHAR_BUF_SIZE); - if (fSmall) - { - var src = new StringBuilder(s); - src.CopyTo(0, ach, 0, cch); - } - - for (int ofch = 0; ofch < cch; ++ofch) - { - int ch; - if (fSmall) - { - ch = ach[ofch]; - } - else - { - int ofBuf = ofch & CHAR_BUF_MASK; - if (ofBuf == 0) - { - var src = new StringBuilder(s); - int len = Math.Min(ofch + CHAR_BUF_SIZE, cch) - ofch; - src.CopyTo(ofch, ach, 0, len); - } - ch = ach[ofBuf]; - } - - if (ch <= 0x007F) - { - // all bytes in this range use the 1-byte format - // except for 0 - if (ch == 0) - { - ++cb; - } - } - else - { - // either a 2-byte format or a 3-byte format (if over - // 0x07FF) - cb += (ch <= 0x07FF ? 1 : 2); - } - } - - return cb; - } - - /// - /// Format the passed String as UTF into the passed byte array. - /// - /// - /// This method is tightly bound to calcUTF. - /// - /// - /// the string. - /// - /// - /// The formated UTF byte array. - /// - public byte[] FormatUTF(String s) - { - int cch = s.Length; - int cb = CalcUTF(s); - int ofb = 0; - byte[] ab = new byte[cb]; - - if (cb == cch) - { - // ask the string to convert itself to ascii bytes - // straight into the WriteBuffer - Encoding.ASCII.GetBytes(s, 0, cch, ab, ofb); - } - else - { - char[] ach = CharBuf; - if (cch <= CHAR_BUF_SIZE) - { - // The following is unnecessary, because it would already - // have been performed by calcUTF: - // - // if (fSmall) - // { - // s.getChars(0, cch, ach, 0); - // } - FormatUTF(ab, ofb, ach, cch); - } - else - { - for (int ofch = 0; ofch < cch; ofch += CHAR_BUF_SIZE) - { - int cchChunk = Math.Min(CHAR_BUF_SIZE, cch - ofch); - StringBuilder src = new StringBuilder(s); - src.CopyTo(ofch, ach, 0, cchChunk); - ofb += FormatUTF(ab, ofb, ach, cchChunk); - } - } - } - - return ab; - } - - /// - /// Format the passed characters as UTF into the passed byte array. - /// - /// - /// The byte array to format into. - /// - /// - /// The offset into the byte array to write the first byte. - /// - /// - /// The array of characters to format. - /// - /// - /// The number of characters to format. - /// - /// - /// The number of bytes written to the array. - /// - protected int FormatUTF(byte[] ab, int ofb, char[] ach, int cch) - { - int ofbOrig = ofb; - for (int ofch = 0; ofch < cch; ++ofch) - { - char ch = ach[ofch]; - if (ch >= 0x0001 && ch <= 0x007F) - { - // 1-byte format: 0xxx xxxx - ab[ofb++] = (byte) ch; - } - else if (ch <= 0x07FF) - { - // 2-byte format: 110x xxxx, 10xx xxxx - ab[ofb++] = (byte) (0xC0 | ((ch >> 6) & 0x1F)); - ab[ofb++] = (byte) (0x80 | ((ch ) & 0x3F)); - } - else - { - // 3-byte format: 1110 xxxx, 10xx xxxx, 10xx xxxx - ab[ofb++] = (byte) (0xE0 | ((ch >> 12) & 0x0F)); - ab[ofb++] = (byte) (0x80 | ((ch >> 6) & 0x3F)); - ab[ofb++] = (byte) (0x80 | ((ch ) & 0x3F)); - } - } - return ofb - ofbOrig; - } - - /// - /// Get a buffer for formating data to bytes. Note that the resulting buffer - /// may be shorter than the requested size. - /// - /// - /// the requested size for the buffer - /// - /// - /// A byte array that is at least cb bytes long, but not - /// shorter than and (regardless of the value of - /// cb) not longer than . - /// - protected byte[] Tmpbuf(int cb) - { - byte[] ab = m_abBuf; - if (ab == null || ab.Length < cb) - { - int cbOld = ab == null ? 0 : ab.Length; - int cbNew = Math.Max(MIN_BUF, Math.Min(MAX_BUF , cb)); - if (cbNew > cbOld) - { - m_abBuf = ab = new byte[cbNew > ((uint) MAX_BUF >> 1) ? MAX_BUF : cbNew]; - } - } - return ab; - } - - #endregion - - #region Data Members - - /// - /// The minimum size of the temp buffer. - /// - private const int MIN_BUF = 0x40; - - /// - /// The maximum size of the temp buffer. The maximum size must be at least - /// (3 * CHAR_BUF_SIZE) to accomodate the worst-case UTF - /// formatting length. - /// - private const int MAX_BUF = 0x400; - - /// - /// Size of the temporary character buffer. Must be a power of 2. - /// Size is: 256 characters (.25 KB). - /// - protected const int CHAR_BUF_SIZE = 0x100; - - /// - /// Bitmask used against a raw offset to determine the offset within - /// the temporary character buffer. - /// - protected const int CHAR_BUF_MASK = (CHAR_BUF_SIZE - 1); - - /// - /// A temp buffer to use for building the data to write. - /// - [NonSerialized] - private byte[] m_abBuf; - - /// - /// A lazily instantiated temp buffer used to avoid allocations from - /// and repeated calls to String functions. - /// - [NonSerialized] - protected char[] m_achBuf; - - #endregion } } \ No newline at end of file diff --git a/src/Coherence.Core/IO/Pof/PofHelper.cs b/src/Coherence.Core/IO/Pof/PofHelper.cs index a81fffe..58aa553 100644 --- a/src/Coherence.Core/IO/Pof/PofHelper.cs +++ b/src/Coherence.Core/IO/Pof/PofHelper.cs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2022, Oracle and/or its affiliates. + * Copyright (c) 2000, 2024, Oracle and/or its affiliates. * * Licensed under the Universal Permissive License v 1.0 as shown at * https://oss.oracle.com/licenses/upl. @@ -313,51 +313,6 @@ public static int DecodeTinyInt(int n) /// public static char ReadChar(DataReader reader) { - // int ch = reader.PeekChar(); - // if (ch == 65533) // Unicode replacement character - // { - // ch = reader.ReadByte(); - // int ch1 = ch & 0xFF; - // switch ((ch1 & 0xF0) >> 4) - // { - // case 0xC: - // case 0xD: - // { - // // 2-byte format: 110x xxxx, 10xx xxxx - // int ch2 = reader.ReadByte() & 0xFF; - // if ((ch2 & 0xC0) != 0x80) - // { - // throw new ArgumentException( - // "illegal leading UTF byte: " + ch2); - // } - // ch = (char)(((ch1 & 0x1F) << 6) | ch2 & 0x3F); - // break; - // } - // - // case 0xE: - // { - // // 3-byte format: 1110 xxxx, 10xx xxxx, 10xx xxxx - // int ch2 = reader.ReadByte() & 0xFF; - // int ch3 = reader.ReadByte() & 0xFF; - // if ((ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80) - // { - // throw new ArgumentException( - // "illegal leading UTF bytes: " + ch2 + ", " + ch3); - // } - // ch = (char)(((ch & 0x0F) << 12) | - // ((ch2 & 0x3F) << 6) | - // ((ch3 & 0x3F))); - // break; - // } - // - // default: - // throw new ArgumentException( - // "illegal leading UTF byte: " + ch); - // } - // - // return (char) ch; - // } - return reader.ReadChar(); } diff --git a/src/Coherence.Core/IO/Pof/WritingPofHandler.cs b/src/Coherence.Core/IO/Pof/WritingPofHandler.cs index 8890a05..40e3985 100644 --- a/src/Coherence.Core/IO/Pof/WritingPofHandler.cs +++ b/src/Coherence.Core/IO/Pof/WritingPofHandler.cs @@ -6,7 +6,7 @@ */ using System; using System.Diagnostics; -using System.Text; + using Tangosol.Util; namespace Tangosol.IO.Pof @@ -744,14 +744,7 @@ public virtual void OnChar(int position, char ch) writer.WritePackedInt32(PofConstants.T_CHAR); } - if (Char.IsSurrogate(ch)) - { - writer.Write(writer.FormatUTF(ch.ToString())); - } - else - { - writer.Write(ch); - } + writer.Write(ch); } } diff --git a/src/Coherence.Core/Util/SerializationHelper.cs b/src/Coherence.Core/Util/SerializationHelper.cs index d82f496..9649a43 100644 --- a/src/Coherence.Core/Util/SerializationHelper.cs +++ b/src/Coherence.Core/Util/SerializationHelper.cs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2020, Oracle and/or its affiliates. + * Copyright (c) 2000, 2024, Oracle and/or its affiliates. * * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. @@ -145,7 +145,7 @@ public static String ConvertUTF(byte[] ab, int of, int cb) } else { - ach[ofch++] = (char)n; + ach[ofch++] = (char) n; } } @@ -198,6 +198,31 @@ public static String ConvertUTF(byte[] ab, int of, int cb) break; } + case 0xF: + { + // 4-byte format: 1111 0xxx, 10xx xxxx, 10xx xxxx, 10xx xxxx (supplemental plane) + int ch2 = ab[++ofAsc] & 0xFF; + int ch3 = ab[++ofAsc] & 0xFF; + int ch4 = ab[++ofAsc] & 0xFF; + if ((ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80) + { + throw new IOException( + "illegal leading UTF bytes: " + ch2 + ", " + ch3 + ", " + ch4); + } + int cp = ((ch & 0x07) << 18) | + ((ch2 & 0x3F) << 12) | + ((ch3 & 0x3F) << 6) | + (ch4 & 0x3F); + + cp = cp - 0x10000; + char high = (char) (0xD800 + ((cp >> 10) & 0x3FF)); + char low = (char) (0xDC00 + (cp & 0x3FF)); + + ach[ofch++] = high; + ach[ofch++] = low; + break; + } + default: throw new IOException( "illegal leading UTF byte: " + ch); diff --git a/tests/Coherence.Core.Tests/IO/IndentingWriterTests.cs b/tests/Coherence.Core.Tests/IO/IndentingWriterTests.cs index ea341c3..c8a3932 100644 --- a/tests/Coherence.Core.Tests/IO/IndentingWriterTests.cs +++ b/tests/Coherence.Core.Tests/IO/IndentingWriterTests.cs @@ -37,12 +37,33 @@ public void TestIndentingWriter() indWriter.Write(nl); indWriter.Write(line2, 1, 5); indWriter.Write(line3); - Assert.AreEqual("line1\nline2\nline3", strWriter.ToString()); + if (strWriter.ToString().Contains('\r')) + { + Assert.AreEqual("line1\nline2\nline3\r", strWriter.ToString()); + } + else + { + Assert.AreEqual("line1\nline2\nline3", strWriter.ToString()); + } indWriter.Resume(); indWriter.WriteLine(); - Assert.AreEqual("line1\nline2\nline3\n", strWriter.ToString()); + if (strWriter.ToString().Contains('\r')) + { + Assert.AreEqual("line1\nline2\nline3\r\n", strWriter.ToString()); + } + else + { + Assert.AreEqual("line1\nline2\nline3\n", strWriter.ToString()); + } indWriter.Write(line1); - Assert.AreEqual("line1\nline2\nline3\n line1", strWriter.ToString()); + if (strWriter.ToString().Contains('\r')) + { + Assert.AreEqual("line1\nline2\nline3\r\n line1", strWriter.ToString()); + } + else + { + Assert.AreEqual("line1\nline2\nline3\n line1", strWriter.ToString()); + } strWriter = new StringWriter(); indWriter = new IndentingWriter(strWriter, "ZZZ"); diff --git a/tests/Coherence.Core.Tests/IO/Pof/PofStreamPrimitiveArrayTests.cs b/tests/Coherence.Core.Tests/IO/Pof/PofStreamPrimitiveArrayTests.cs index 4146442..7a6394c 100644 --- a/tests/Coherence.Core.Tests/IO/Pof/PofStreamPrimitiveArrayTests.cs +++ b/tests/Coherence.Core.Tests/IO/Pof/PofStreamPrimitiveArrayTests.cs @@ -147,7 +147,7 @@ public void TestCharArray() // Create a character array with multi-bytes character. string gkNumber = Char.ConvertFromUtf32(0x10154); - char[] chars = new[] { 'z', 'a', '\u0306', '\u01FD', '\u03B2', gkNumber[0], gkNumber[1] }; + char[] chars = new char[] {'z', 'a', '\u0306', '\u01FD', '\u03B2'}; // Create a string with multi-bytes character. String multiStr = "abc" + Char.ConvertFromUtf32(Int32.Parse("2A601", NumberStyles.HexNumber)) + "def"; @@ -158,7 +158,7 @@ public void TestCharArray() pofWriter.WriteArray(0, objArray); pofWriter.WriteCollection(0, al); pofWriter.WriteString(0, str); - //pofWriter.WriteCharArray(0, chars); + pofWriter.WriteCharArray(0, chars); pofWriter.WriteString(0, multiStr); initPOFReader(); @@ -169,7 +169,7 @@ public void TestCharArray() Assert.AreEqual(al.ToArray(), pofReader.ReadCharArray(0)); Assert.AreEqual(str.ToCharArray(), pofReader.ReadCharArray(0)); // TODO: re-enable this test - //Assert.AreEqual(chars, pofReader.ReadCharArray(0)); + Assert.AreEqual(chars, pofReader.ReadCharArray(0)); Assert.AreEqual(multiStr, pofReader.ReadString(0)); } diff --git a/tests/Coherence.Core.Tests/IO/Pof/PofStreamReaderAndWriterTests.cs b/tests/Coherence.Core.Tests/IO/Pof/PofStreamReaderAndWriterTests.cs index 2335d99..b264728 100644 --- a/tests/Coherence.Core.Tests/IO/Pof/PofStreamReaderAndWriterTests.cs +++ b/tests/Coherence.Core.Tests/IO/Pof/PofStreamReaderAndWriterTests.cs @@ -5,13 +5,12 @@ * http://oss.oracle.com/licenses/upl. */ using System; -using System.Collections.Generic; using System.Globalization; using System.IO; +using System.Text; using NUnit.Framework; using Tangosol.Util; -using IList=System.Collections.IList; namespace Tangosol.IO.Pof { @@ -91,9 +90,7 @@ public void TestUTF8Serialization() String surrogate = "abc" + Char.ConvertFromUtf32(Int32.Parse("2A601", NumberStyles.HexNumber)) + "def"; // Safe UTF-8 encoding & decoding of string. - Stream stream = new MemoryStream(); - DataWriter writer = new DataWriter(stream); - byte[] bytes = writer.FormatUTF(surrogate); + byte[] bytes = Encoding.UTF8.GetBytes(surrogate); Console.WriteLine("Safe UTF-8-encoded code units:"); foreach (var utf8Byte in bytes) @@ -102,5 +99,68 @@ public void TestUTF8Serialization() string s = SerializationHelper.ConvertUTF(bytes, 0, bytes.Length); Assert.AreEqual(s, surrogate); } + + /// + /// Additional UTF-8 conversion tests. + /// + /// Coherence 14.1.1.15 + [Test] + public void TestUtfConversion() + { + AssertUtfConversion("Aleksandar"); + AssertUtfConversion("Александар"); + AssertUtfConversion("ⅯⅭⅯⅬⅩⅩⅠⅤ"); + + uint[] aInt = new uint[] { 0xf0938080, 0xf09f8ebf, 0xf09f8f80, 0xf09f8e89, 0xf09f9294 }; + byte[] aByte = ToBytes(aInt); + AssertUtfConversion(aByte); + + // make sure we can still handle our proprietary (broken) encoding + String sUtf = Encoding.UTF8.GetString(aByte); + ISerializer serializer = new SimplePofContext(); + Binary bin = SerializationHelper.ToBinary(sUtf, serializer); + + Assert.AreEqual(23, bin.Length); + Assert.AreEqual(sUtf, SerializationHelper.FromBinary(bin, serializer)); + } + + #region helper methods + + private void AssertUtfConversion(String s) + { + AssertUtfConversion(Encoding.UTF8.GetBytes(s)); + } + + private void AssertUtfConversion(byte[] abUtf8) + { + String sExpected = Encoding.UTF8.GetString(abUtf8); + String sActual = SerializationHelper.ConvertUTF(abUtf8, 0, abUtf8.Length); + Console.Write("\n%12s = %-12s : utf8 bytes = %d; string length = %d", sExpected, sActual, abUtf8.Length, sActual.Length); + Assert.AreEqual(sExpected, sActual); + } + + private static byte[] ToBytes(uint[] ai) + { + byte[] abResult = new byte[ai.Length * 4]; + int i = 0; + + foreach (uint n in ai) + { + MemoryStream buf = new MemoryStream(); + BinaryWriter writer = new BinaryWriter(buf); + + writer.Write(n); // this writes in little endian (.NET default), expects big endian (Java default) + byte[] ab = buf.ToArray(); + abResult[i + 3] = ab[0]; + abResult[i + 2] = ab[1]; + abResult[i + 1] = ab[2]; + abResult[i] = ab[3]; + i += 4; + } + + return abResult; + } + #endregion + } }