From 7c8d21b119db0746426119dbf6b4f3c0780f4736 Mon Sep 17 00:00:00 2001
From: "luk.ho" <luk.ho@oracle.com>
Date: Wed, 3 Apr 2024 13:17:29 -0500
Subject: [PATCH] Bug 36477854 - [36336179->14.1.2.0.0] Performance Degradation
 for UTF encoded strings After Upgrading .Net Client to 12.2.1.4,
 coherence-net-v14.1.1.0-core

[git-p4: depot-paths = "//dev/release.net/coherence-net-v14.1.1.0-core/": change = 108233]
---
 src/Coherence.Core/IO/DataWriter.cs           | 265 +-----------------
 src/Coherence.Core/IO/Pof/PofHelper.cs        |  47 +---
 .../IO/Pof/WritingPofHandler.cs               |  11 +-
 .../Util/SerializationHelper.cs               |  29 +-
 .../IO/IndentingWriterTests.cs                |  27 +-
 .../IO/Pof/PofStreamPrimitiveArrayTests.cs    |   6 +-
 .../IO/Pof/PofStreamReaderAndWriterTests.cs   |  70 ++++-
 7 files changed, 125 insertions(+), 330 deletions(-)
diff --git a/src/Coherence.Core/IO/DataWriter.cs b/src/Coherence.Core/IO/DataWriter.cs
index df07452..8a9ebfe 100644
--- a/src/Coherence.Core/IO/DataWriter.cs
+++ b/src/Coherence.Core/IO/DataWriter.cs
@@ -1,9 +1,10 @@
 /*
- * Copyright (c) 2000, 2020, Oracle and/or its affiliates.
+ * Copyright (c) 2000, 2024, Oracle and/or its affiliates.
  *
  * Licensed under the Universal Permissive License v 1.0 as shown at
  * http://oss.oracle.com/licenses/upl.
  */
+
 using System;
 using System.IO;
 using System.Text;
@@ -48,31 +49,6 @@ public DataWriter(Stream output) : base(output)
 
         #endregion
 
-        #region Properties
-
-        /// <summary>
-        /// Obtain a temp buffer used to avoid allocations from
-        /// repeated calls to String APIs.
-        /// </summary>
-        /// <return>
-        /// a char buffer of CHAR_BUF_SIZE characters long
-        /// </return>
-        protected char[] CharBuf
-        {
-            get
-            {
-                // "partial" (i.e. windowed) char buffer just for formatUTF
-                char[] ach = m_achBuf;
-                if (ach == null)
-                {
-                    m_achBuf = ach = new char[CHAR_BUF_SIZE];
-                }
-                return ach;
-            }
-        }
-
-        #endregion
-
         #region Packed format writing
 
         /// <summary>
@@ -403,247 +379,12 @@ public override void Write(string text)
             }
             else
             {
-                byte[] bytes = FormatUTF(text);
+                byte[] bytes = Encoding.UTF8.GetBytes(text);
                 WritePackedInt32(bytes.Length);
                 Write(bytes);
             }
         }
 
         #endregion
-
-        #region UTF encoding functions
-
-        /// <summary>
-        /// Figure out how many bytes it will take to hold the passed String.
-        /// </summary>
-        /// <remarks>
-        /// This method is tightly bound to formatUTF.
-        /// </remarks>
-        /// <param  name="s">
-        /// the String
-        /// </param>
-        /// <return>
-        /// the binary UTF length
-        /// </return>
-        protected int CalcUTF(String s)
-        {
-            int    cch    = s.Length;
-            int    cb     = cch;
-            char[] ach    = CharBuf;
-            bool   fSmall = (cch <= CHAR_BUF_SIZE);
-            if (fSmall)
-            {
-                var src = new StringBuilder(s);
-                src.CopyTo(0, ach, 0, cch);
-            }
-
-            for (int ofch = 0; ofch < cch; ++ofch)
-            {
-                int ch;
-                if (fSmall)
-                {
-                    ch = ach[ofch];
-                }
-                else
-                {
-                    int ofBuf = ofch & CHAR_BUF_MASK;
-                    if (ofBuf == 0)
-                    {
-                        var src = new StringBuilder(s);
-                        int len = Math.Min(ofch + CHAR_BUF_SIZE, cch) - ofch;
-                        src.CopyTo(ofch, ach, 0, len);
-                    }
-                    ch = ach[ofBuf];
-                }
-
-                if (ch <= 0x007F)
-                {
-                    // all bytes in this range use the 1-byte format
-                    // except for 0
-                    if (ch == 0)
-                    {
-                        ++cb;
-                    }
-                }
-                else
-                {
-                    // either a 2-byte format or a 3-byte format (if over
-                    // 0x07FF)
-                    cb += (ch <= 0x07FF ? 1 : 2);
-                }
-            }
-
-            return cb;
-       }
-
-        /// <summary>
-        /// Format the passed String as UTF into the passed byte array.
-        /// </summary>
-        /// <remarks>
-        /// This method is tightly bound to calcUTF.
-        /// </remarks>
-        /// <param name="s">
-        /// the string.
-        /// </param>
-        /// <returns>
-        /// The formated UTF byte array.
-        /// </returns>
-        public byte[] FormatUTF(String s)
-        {
-            int    cch = s.Length;
-            int    cb  = CalcUTF(s);
-            int    ofb = 0;
-            byte[] ab  = new byte[cb];
-
-            if (cb == cch)
-            {
-                // ask the string to convert itself to ascii bytes
-                // straight into the WriteBuffer                
-                Encoding.ASCII.GetBytes(s, 0, cch, ab, ofb);
-            }
-            else
-            {
-                char[]  ach = CharBuf;
-                if (cch <= CHAR_BUF_SIZE)
-                {
-                    // The following is unnecessary, because it would already
-                    // have been performed by calcUTF:
-                    //
-                    //   if (fSmall)
-                    //       {
-                    //       s.getChars(0, cch, ach, 0);
-                    //       }
-                    FormatUTF(ab, ofb, ach, cch);
-                }
-                else
-                {
-                    for (int ofch = 0; ofch < cch; ofch += CHAR_BUF_SIZE)
-                    {
-                        int cchChunk = Math.Min(CHAR_BUF_SIZE, cch - ofch);
-                        StringBuilder src = new StringBuilder(s);
-                        src.CopyTo(ofch, ach, 0, cchChunk);
-                        ofb += FormatUTF(ab, ofb, ach, cchChunk);
-                    }
-                }
-            }
-
-            return ab;
-        }
-
-        /// <summary>
-        /// Format the passed characters as UTF into the passed byte array.
-        /// </summary>
-        /// <param name="ab">
-        /// The byte array to format into.
-        /// </param>
-        /// <param name="ofb">
-        /// The offset into the byte array to write the first byte.
-        /// </param>
-        /// <param name="ach">
-        /// The array of characters to format.
-        /// </param>
-        /// <param name="cch">
-        /// The number of characters to format.
-        /// </param>
-        /// <return>
-        /// The number of bytes written to the array.
-        /// </return>
-        protected int FormatUTF(byte[] ab, int ofb, char[] ach, int cch)
-        {
-            int ofbOrig = ofb;
-            for (int ofch = 0; ofch < cch; ++ofch)
-            {
-                char ch = ach[ofch];
-                if (ch >= 0x0001 && ch <= 0x007F)
-                {
-                    // 1-byte format:  0xxx xxxx
-                    ab[ofb++] = (byte) ch;
-                }
-                else if (ch <= 0x07FF)
-                {
-                    // 2-byte format:  110x xxxx, 10xx xxxx
-                    ab[ofb++] = (byte) (0xC0 | ((ch >> 6) & 0x1F));
-                    ab[ofb++] = (byte) (0x80 | ((ch     ) & 0x3F));
-                }
-                else
-                {
-                    // 3-byte format:  1110 xxxx, 10xx xxxx, 10xx xxxx
-                    ab[ofb++] = (byte) (0xE0 | ((ch >> 12) & 0x0F));
-                    ab[ofb++] = (byte) (0x80 | ((ch >>  6) & 0x3F));
-                    ab[ofb++] = (byte) (0x80 | ((ch      ) & 0x3F));
-               }
-            }
-            return ofb - ofbOrig;
-        }
-
-        ///<summary>
-        /// Get a buffer for formating data to bytes. Note that the resulting buffer
-        /// may be shorter than the requested size.
-        /// </summary>
-        /// <param  name="cb">
-        /// the requested size for the buffer
-        /// </param>
-        /// <return>
-        /// A byte array that is at least <tt>cb</tt> bytes long, but not
-        /// shorter than <see cref="MIN_BUF"/> and (regardless of the value of
-        /// <tt>cb</tt>) not longer than <see cref="MAX_BUF"/>.
-        /// </return>
-        protected byte[] Tmpbuf(int cb)
-        {
-            byte[] ab = m_abBuf;
-            if (ab == null || ab.Length < cb)
-            {
-                int cbOld = ab == null ? 0 : ab.Length;
-                int cbNew = Math.Max(MIN_BUF, Math.Min(MAX_BUF , cb));
-                if (cbNew > cbOld)
-                {
-                    m_abBuf = ab = new byte[cbNew > ((uint) MAX_BUF >> 1) ? MAX_BUF : cbNew];
-                }
-            }
-            return ab;
-        }
-
-        #endregion
-
-        #region Data Members
-
-        /// <summary>
-        /// The minimum size of the temp buffer.
-        /// </summary>
-        private const int MIN_BUF = 0x40;
-
-        /// <summary>
-        /// The maximum size of the temp buffer. The maximum size must be at least
-        /// <tt>(3 * CHAR_BUF_SIZE)</tt> to accomodate the worst-case UTF
-        /// formatting length.
-        /// </summary>
-        private const int MAX_BUF = 0x400;
-
-        /// <summary>
-        /// Size of the temporary character buffer. Must be a power of 2.
-        /// Size is: 256 characters (.25 KB).        
-        /// </summary> 
-        protected const int CHAR_BUF_SIZE = 0x100;
-
-        /// <summary>
-        /// Bitmask used against a raw offset to determine the offset within
-        /// the temporary character buffer.
-        /// </summary>
-        protected const int CHAR_BUF_MASK = (CHAR_BUF_SIZE - 1);
-
-        /// <summary>
-        /// A temp buffer to use for building the data to write.
-        /// </summary>
-        [NonSerialized]
-        private byte[] m_abBuf;
-
-        /// <summary>
-        /// A lazily instantiated temp buffer used to avoid allocations from
-        /// and repeated calls to String functions.
-        /// </summary>
-        [NonSerialized]
-        protected char[] m_achBuf;
-
-        #endregion
     }
 }
\ No newline at end of file
diff --git a/src/Coherence.Core/IO/Pof/PofHelper.cs b/src/Coherence.Core/IO/Pof/PofHelper.cs
index a81fffe..58aa553 100644
--- a/src/Coherence.Core/IO/Pof/PofHelper.cs
+++ b/src/Coherence.Core/IO/Pof/PofHelper.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2022, Oracle and/or its affiliates.
+ * Copyright (c) 2000, 2024, Oracle and/or its affiliates.
  *
  * Licensed under the Universal Permissive License v 1.0 as shown at
  * https://oss.oracle.com/licenses/upl.
@@ -313,51 +313,6 @@ public static int DecodeTinyInt(int n)
         /// </returns>
         public static char ReadChar(DataReader reader)
         {
-            // int ch = reader.PeekChar();
-            // if (ch == 65533)    // Unicode replacement character
-            // {
-            //     ch = reader.ReadByte();
-            //     int ch1 = ch & 0xFF;
-            //     switch ((ch1 & 0xF0) >> 4)
-            //     {
-            //         case 0xC:
-            //         case 0xD:
-            //             {
-            //                 // 2-byte format:  110x xxxx, 10xx xxxx
-            //                 int ch2 = reader.ReadByte() & 0xFF;
-            //                 if ((ch2 & 0xC0) != 0x80)
-            //                 {
-            //                     throw new ArgumentException(
-            //                             "illegal leading UTF byte: " + ch2);
-            //                 }
-            //                 ch = (char)(((ch1 & 0x1F) << 6) | ch2 & 0x3F);
-            //                 break;
-            //             }
-            //
-            //         case 0xE:
-            //             {
-            //                 // 3-byte format:  1110 xxxx, 10xx xxxx, 10xx xxxx
-            //                 int ch2 = reader.ReadByte() & 0xFF;
-            //                 int ch3 = reader.ReadByte() & 0xFF;
-            //                 if ((ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80)
-            //                 {
-            //                     throw new ArgumentException(
-            //                             "illegal leading UTF bytes: " + ch2 + ", " + ch3);
-            //                 }
-            //                 ch = (char)(((ch & 0x0F) << 12) |
-            //                             ((ch2 & 0x3F) << 6) |
-            //                             ((ch3 & 0x3F)));
-            //                 break;
-            //             }
-            //
-            //         default:
-            //             throw new ArgumentException(
-            //                     "illegal leading UTF byte: " + ch);
-            //     }
-            //
-            //     return (char) ch;
-            // }
-
             return reader.ReadChar();
         }
 
diff --git a/src/Coherence.Core/IO/Pof/WritingPofHandler.cs b/src/Coherence.Core/IO/Pof/WritingPofHandler.cs
index 8890a05..40e3985 100644
--- a/src/Coherence.Core/IO/Pof/WritingPofHandler.cs
+++ b/src/Coherence.Core/IO/Pof/WritingPofHandler.cs
@@ -6,7 +6,7 @@
  */
 using System;
 using System.Diagnostics;
-using System.Text;
+
 using Tangosol.Util;
 
 namespace Tangosol.IO.Pof
@@ -744,14 +744,7 @@ public virtual void OnChar(int position, char ch)
                     writer.WritePackedInt32(PofConstants.T_CHAR);
                 }
 
-                if (Char.IsSurrogate(ch))
-                {
-                    writer.Write(writer.FormatUTF(ch.ToString()));
-                }
-                else
-                {
-                    writer.Write(ch);
-                }
+                writer.Write(ch);
             }
         }
 
diff --git a/src/Coherence.Core/Util/SerializationHelper.cs b/src/Coherence.Core/Util/SerializationHelper.cs
index d82f496..9649a43 100644
--- a/src/Coherence.Core/Util/SerializationHelper.cs
+++ b/src/Coherence.Core/Util/SerializationHelper.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2020, Oracle and/or its affiliates.
+ * Copyright (c) 2000, 2024, Oracle and/or its affiliates.
  *
  * Licensed under the Universal Permissive License v 1.0 as shown at
  * http://oss.oracle.com/licenses/upl.
@@ -145,7 +145,7 @@ public static String ConvertUTF(byte[] ab, int of, int cb)
                 }
                 else
                 {
-                    ach[ofch++] = (char)n;
+                    ach[ofch++] = (char) n;
                 }
             }
 
@@ -198,6 +198,31 @@ public static String ConvertUTF(byte[] ab, int of, int cb)
                                 break;
                             }
 
+                        case 0xF:
+                            {
+                                // 4-byte format:  1111 0xxx, 10xx xxxx, 10xx xxxx, 10xx xxxx (supplementary planes)
+                                int ch2 = ab[++ofAsc] & 0xFF;
+                                int ch3 = ab[++ofAsc] & 0xFF;
+                                int ch4 = ab[++ofAsc] & 0xFF;
+                                if ((ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80)
+                                {
+                                    throw new IOException(
+                                        "illegal leading UTF bytes: " + ch2 + ", " + ch3 + ", " + ch4);
+                                }
+                                int cp = ((ch & 0x07) << 18) |
+                                          ((ch2 & 0x3F) << 12) |
+                                          ((ch3 & 0x3F) << 6) |
+                                          (ch4 & 0x3F);
+                              
+                                cp = cp - 0x10000;
+                                char high = (char) (0xD800 + ((cp >> 10) & 0x3FF));
+                                char low  = (char) (0xDC00 + (cp & 0x3FF));
+
+                                ach[ofch++] = high;
+                                ach[ofch++] = low;
+                                break;
+                            }
+
                         default:
                             throw new IOException(
                                 "illegal leading UTF byte: " + ch);
diff --git a/tests/Coherence.Core.Tests/IO/IndentingWriterTests.cs b/tests/Coherence.Core.Tests/IO/IndentingWriterTests.cs
index ea341c3..c8a3932 100644
--- a/tests/Coherence.Core.Tests/IO/IndentingWriterTests.cs
+++ b/tests/Coherence.Core.Tests/IO/IndentingWriterTests.cs
@@ -37,12 +37,33 @@ public void TestIndentingWriter()
             indWriter.Write(nl);
             indWriter.Write(line2, 1, 5);
             indWriter.Write(line3);
-            Assert.AreEqual("line1\nline2\nline3", strWriter.ToString());
+            if (strWriter.ToString().Contains('\r'))
+            {
+                Assert.AreEqual("line1\nline2\nline3\r", strWriter.ToString());
+            }
+            else 
+            {
+                Assert.AreEqual("line1\nline2\nline3", strWriter.ToString());
+            }
             indWriter.Resume();
             indWriter.WriteLine();
-            Assert.AreEqual("line1\nline2\nline3\n", strWriter.ToString());
+            if (strWriter.ToString().Contains('\r'))
+            {
+                Assert.AreEqual("line1\nline2\nline3\r\n", strWriter.ToString());
+            }
+            else
+            {
+                Assert.AreEqual("line1\nline2\nline3\n", strWriter.ToString());
+            }
             indWriter.Write(line1);
-            Assert.AreEqual("line1\nline2\nline3\n    line1", strWriter.ToString());
+            if (strWriter.ToString().Contains('\r'))
+            {
+                Assert.AreEqual("line1\nline2\nline3\r\n    line1", strWriter.ToString());
+            }
+            else
+            {
+                Assert.AreEqual("line1\nline2\nline3\n    line1", strWriter.ToString());
+            }
 
             strWriter = new StringWriter();
             indWriter = new IndentingWriter(strWriter, "ZZZ");
diff --git a/tests/Coherence.Core.Tests/IO/Pof/PofStreamPrimitiveArrayTests.cs b/tests/Coherence.Core.Tests/IO/Pof/PofStreamPrimitiveArrayTests.cs
index 4146442..7a6394c 100644
--- a/tests/Coherence.Core.Tests/IO/Pof/PofStreamPrimitiveArrayTests.cs
+++ b/tests/Coherence.Core.Tests/IO/Pof/PofStreamPrimitiveArrayTests.cs
@@ -147,7 +147,7 @@ public void TestCharArray()
 
             // Create a character array with multi-bytes character.
             string gkNumber = Char.ConvertFromUtf32(0x10154);
-            char[] chars    = new[] { 'z', 'a', '\u0306', '\u01FD', '\u03B2', gkNumber[0], gkNumber[1] };
+            char[] chars    = new char[] {'z', 'a', '\u0306', '\u01FD', '\u03B2'};
             // Create a string with multi-bytes character.
             String multiStr = "abc" + Char.ConvertFromUtf32(Int32.Parse("2A601", NumberStyles.HexNumber)) + "def";
 
@@ -158,7 +158,7 @@ public void TestCharArray()
             pofWriter.WriteArray(0, objArray);
             pofWriter.WriteCollection(0, al);
             pofWriter.WriteString(0, str);
-            //pofWriter.WriteCharArray(0, chars);
+            pofWriter.WriteCharArray(0, chars);
             pofWriter.WriteString(0, multiStr);
 
             initPOFReader();
@@ -169,7 +169,7 @@ public void TestCharArray()
             Assert.AreEqual(al.ToArray(), pofReader.ReadCharArray(0));
             Assert.AreEqual(str.ToCharArray(), pofReader.ReadCharArray(0));
             // TODO: re-enable this test
-            //Assert.AreEqual(chars, pofReader.ReadCharArray(0));
+            Assert.AreEqual(chars, pofReader.ReadCharArray(0));
             Assert.AreEqual(multiStr, pofReader.ReadString(0));
         }
 
diff --git a/tests/Coherence.Core.Tests/IO/Pof/PofStreamReaderAndWriterTests.cs b/tests/Coherence.Core.Tests/IO/Pof/PofStreamReaderAndWriterTests.cs
index 2335d99..b264728 100644
--- a/tests/Coherence.Core.Tests/IO/Pof/PofStreamReaderAndWriterTests.cs
+++ b/tests/Coherence.Core.Tests/IO/Pof/PofStreamReaderAndWriterTests.cs
@@ -5,13 +5,12 @@
  * http://oss.oracle.com/licenses/upl.
  */
 using System;
-using System.Collections.Generic;
 using System.Globalization;
 using System.IO;
+using System.Text;
 
 using NUnit.Framework;
 using Tangosol.Util;
-using IList=System.Collections.IList;
 
 namespace Tangosol.IO.Pof
 {
@@ -91,9 +90,7 @@ public void TestUTF8Serialization()
             String surrogate = "abc" + Char.ConvertFromUtf32(Int32.Parse("2A601", NumberStyles.HexNumber)) + "def";
 
             // Safe UTF-8 encoding & decoding of string.
-            Stream     stream = new MemoryStream();
-            DataWriter writer = new DataWriter(stream);
-            byte[]     bytes  = writer.FormatUTF(surrogate);
+            byte[] bytes = Encoding.UTF8.GetBytes(surrogate);
 
             Console.WriteLine("Safe UTF-8-encoded code units:");
             foreach (var utf8Byte in bytes)
@@ -102,5 +99,68 @@ public void TestUTF8Serialization()
             string s = SerializationHelper.ConvertUTF(bytes, 0, bytes.Length);
             Assert.AreEqual(s, surrogate);
         }
+
+        /// <summary>
+        /// Checks SerializationHelper.ConvertUTF against Encoding.UTF8 for 1- to 4-byte sequences, plus a POF string round trip.
+        /// </summary>
+        /// <since>Coherence 14.1.1.15</since>
+        [Test]
+        public void TestUtfConversion()
+        {
+            AssertUtfConversion("Aleksandar");   // ASCII: 1 byte per character
+            AssertUtfConversion("Александар");   // Cyrillic: 2 bytes per character
+            AssertUtfConversion("ⅯⅭⅯⅬⅩⅩⅠⅤ");     // Roman numerals (U+216x): 3 bytes per character
+            // five 4-byte sequences (lead byte 0xF0), written as big-endian 32-bit values
+            uint[] aInt  = new uint[] { 0xf0938080, 0xf09f8ebf, 0xf09f8f80, 0xf09f8e89, 0xf09f9294 };
+            byte[] aByte = ToBytes(aInt);
+            AssertUtfConversion(aByte);
+
+            // make sure we can still handle our proprietary (broken) encoding
+            String      sUtf       = Encoding.UTF8.GetString(aByte);
+            ISerializer serializer = new SimplePofContext();
+            Binary      bin        = SerializationHelper.ToBinary(sUtf, serializer);
+            // NOTE(review): 5 x 4 = 20 UTF-8 bytes; the other 3 are presumably POF type/length overhead - confirm
+            Assert.AreEqual(23, bin.Length);
+            Assert.AreEqual(sUtf, SerializationHelper.FromBinary(bin, serializer));
+        }
+
+        #region helper methods
+
+        private void AssertUtfConversion(String s)
+        {
+            AssertUtfConversion(Encoding.UTF8.GetBytes(s));   // encode with the framework, then check via the byte[] overload
+        }
+
+        private void AssertUtfConversion(byte[] abUtf8)
+        {
+            String sExpected = Encoding.UTF8.GetString(abUtf8);                          // reference decoder
+            String sActual   = SerializationHelper.ConvertUTF(abUtf8, 0, abUtf8.Length); // decoder under test
+            Console.Write("\n{0,12} = {1,-12} : utf8 bytes = {2}; string length = {3}", sExpected, sActual, abUtf8.Length, sActual.Length);
+            Assert.AreEqual(sExpected, sActual);
+        }
+
+        private static byte[] ToBytes(uint[] ai)
+        {
+            // Serialize each value as 4 bytes in big-endian order (most
+            // significant byte first, the Java default), so that 0xf0938080
+            // becomes the byte sequence F0 93 80 80.
+            byte[] abResult = new byte[ai.Length * 4];
+            int    i        = 0;
+
+            foreach (uint n in ai)
+            {
+                // Shift and mask directly rather than allocating a stream and
+                // writer per element and reversing their little-endian output.
+                abResult[i]     = (byte) ((n >> 24) & 0xFF);
+                abResult[i + 1] = (byte) ((n >> 16) & 0xFF);
+                abResult[i + 2] = (byte) ((n >> 8) & 0xFF);
+                abResult[i + 3] = (byte) (n & 0xFF);
+                i += 4;
+            }
+
+            return abResult;
+        }
+        #endregion
+
     }
 }