Refactor to avoid repeated boolean checks

ruby · Jan 16, 2025 · 0133cbc · 0133cbc
1 parent 70aadd0
commit 0133cbc
Show file tree

Hide file tree

Showing 3 changed files with 158 additions and 209 deletions.
diff --git a/java/src/json/ext/Generator.java b/java/src/json/ext/Generator.java
@@ -185,7 +185,9 @@ public byte[] getCharBytes() {
         public StringEncoder getStringEncoder(ThreadContext context) {
             if (stringEncoder == null) {
                 GeneratorState state = getState(context);
-                stringEncoder = new StringEncoder(state.asciiOnly(), state.scriptSafe());
+                stringEncoder = state.asciiOnly() ?
+                        new StringEncoderAsciiOnly(state.scriptSafe()) :
+                        new StringEncoder(state.scriptSafe());
             }
             return stringEncoder;
         }

diff --git a/java/src/json/ext/StringEncoder.java b/java/src/json/ext/StringEncoder.java
@@ -9,6 +9,7 @@
 import org.jcodings.specific.ASCIIEncoding;
 import org.jcodings.specific.USASCIIEncoding;
 import org.jcodings.specific.UTF8Encoding;
+import org.jruby.Ruby;
 import org.jruby.RubyException;
 import org.jruby.RubyString;
 import org.jruby.exceptions.RaiseException;
@@ -25,8 +26,8 @@
  * to another ByteList. The source string is fully checked for UTF-8 validity,
  * and throws a GeneratorError if any problem is found.
  */
-final class StringEncoder extends ByteListTranscoder {
-    private static final int CHAR_LENGTH_MASK = 7;
+class StringEncoder extends ByteListTranscoder {
+    protected static final int CHAR_LENGTH_MASK = 7;
     private static final byte[] BACKSLASH_DOUBLEQUOTE = {'\\', '"'};
     private static final byte[] BACKSLASH_BACKSLASH = {'\\', '\\'};
     private static final byte[] BACKSLASH_FORWARDSLASH = {'\\', '/'};
@@ -111,39 +112,30 @@ final class StringEncoder extends ByteListTranscoder {
     private static final byte[] BACKSLASH_U2028 = "\\u2028".getBytes(StandardCharsets.US_ASCII);
     private static final byte[] BACKSLASH_U2029 = "\\u2029".getBytes(StandardCharsets.US_ASCII);
 
-    private final boolean asciiOnly, scriptSafe;
-    private final byte[] escapeTable;
+    protected final byte[] escapeTable;
 
     OutputStream out;
 
     // Escaped characters will reuse this array, to avoid new allocations
     // or appending them byte-by-byte
-    private final byte[] aux =
+    protected final byte[] aux =
         new byte[] {/* First Unicode character */
                     '\\', 'u', 0, 0, 0, 0,
                     /* Second unicode character (for surrogate pairs) */
                     '\\', 'u', 0, 0, 0, 0,
                     /* "\X" characters */
                     '\\', 0};
-    // offsets on the array above
-    private static final int ESCAPE_UNI1_OFFSET = 0;
-    private static final int ESCAPE_UNI2_OFFSET = ESCAPE_UNI1_OFFSET + 6;
-    private static final int ESCAPE_CHAR_OFFSET = ESCAPE_UNI2_OFFSET + 6;
-    /** Array used for code point decomposition in surrogates */
-    private final char[] utf16 = new char[2];
-
-    private static final byte[] HEX =
+
+    protected static final byte[] HEX =
             new byte[] {'0', '1', '2', '3', '4', '5', '6', '7',
                         '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
 
-    StringEncoder(boolean asciiOnly, boolean scriptSafe) {
-        this.asciiOnly = asciiOnly;
-        this.scriptSafe = scriptSafe;
-        if (asciiOnly) {
-            escapeTable = scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ASCII_ONLY_ESCAPE_TABLE;
-        } else {
-            escapeTable = scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ESCAPE_TABLE;
-        }
+    StringEncoder(boolean scriptSafe) {
+        this(scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ESCAPE_TABLE);
+    }
+
+    StringEncoder(byte[] escapeTable) {
+        this.escapeTable = escapeTable;
     }
 
     // C: generate_json_string
@@ -157,11 +149,7 @@ void generate(ThreadContext context, RubyString object, OutputStream buffer) thr
         switch (object.scanForCodeRange()) {
             case StringSupport.CR_7BIT:
             case StringSupport.CR_VALID:
-                if (asciiOnly) {
-                    encodeASCII(byteList);
-                } else {
-                    encode(byteList);
-                }
+                encode(byteList);
                 break;
             default:
                 throw Utils.buildGeneratorError(context, object, "source sequence is illegal/malformed utf-8").toThrowable();
@@ -181,23 +169,26 @@ static RubyString ensureValidEncoding(ThreadContext context, RubyString str) {
     }
 
     private static RubyString tryWeirdEncodings(ThreadContext context, RubyString str, Encoding encoding) {
+        Ruby runtime = context.runtime;
+
         RubyString utf8String;
+
         if (encoding == ASCIIEncoding.INSTANCE) {
-            utf8String = str.strDup(context.runtime);
+            utf8String = str.strDup(runtime);
             utf8String.setEncoding(UTF8Encoding.INSTANCE);
             switch (utf8String.getCodeRange()) {
                 case StringSupport.CR_7BIT:
                     return utf8String;
                 case StringSupport.CR_VALID:
                     // For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
                     // TODO: Raise in 3.0.0
-                    context.runtime.getWarnings().warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
+                    runtime.getWarnings().warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
                     return utf8String;
             }
         }
 
         try {
-            str = (RubyString) str.encode(context, context.runtime.getEncodingService().convertEncodingToRubyEncoding(UTF8Encoding.INSTANCE));
+            str = (RubyString) str.encode(context, runtime.getEncodingService().convertEncodingToRubyEncoding(UTF8Encoding.INSTANCE));
         } catch (RaiseException re) {
             RubyException exc = Utils.buildGeneratorError(context, str, re.getMessage());
             exc.setCause(re.getException());
@@ -229,24 +220,7 @@ void encode(ByteList src) throws IOException {
                 switch (ch_len) {
                     case 9: {
                         beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
-                        switch (ch) {
-                            case '"':  appendEscape(BACKSLASH_DOUBLEQUOTE); break;
-                            case '\\': appendEscape(BACKSLASH_BACKSLASH); break;
-                            case '/':  appendEscape(BACKSLASH_FORWARDSLASH); break;
-                            case '\b': appendEscape(BACKSLASH_B); break;
-                            case '\f': appendEscape(BACKSLASH_F); break;
-                            case '\n': appendEscape(BACKSLASH_N); break;
-                            case '\r': appendEscape(BACKSLASH_R); break;
-                            case '\t': appendEscape(BACKSLASH_T); break;
-                            default: {
-                                scratch[2] = '0';
-                                scratch[3] = '0';
-                                scratch[4] = hexdig[(ch >> 4) & 0xf];
-                                scratch[5] = hexdig[ch & 0xf];
-                                append(scratch, 0, 6);
-                                break;
-                            }
-                        }
+                        escapeAscii(ch, scratch, hexdig);
                         break;
                     }
                     case 11: {
@@ -280,110 +254,30 @@ void encode(ByteList src) throws IOException {
         }
     }
 
-    private int flushPos(int pos, int beg, byte[] ptrBytes, int ptr, int size) throws IOException {
+    protected int flushPos(int pos, int beg, byte[] ptrBytes, int ptr, int size) throws IOException {
         if (pos > beg) { append(ptrBytes, ptr + beg, pos - beg); }
         return pos + size;
     }
 
-    // C: convert_UTF8_to_ASCII_only_JSON
-    void encodeASCII(ByteList src) throws IOException {
-        byte[] hexdig = HEX;
-        byte[] scratch = aux;
-        byte[] escapeTable = this.escapeTable;
-
-        byte[] ptrBytes = src.unsafeBytes();
-        int ptr = src.begin();
-        int len = src.realSize();
-
-        int beg = 0;
-        int pos = 0;
-
-        while (pos < len) {
-            int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]);
-            int ch_len = escapeTable[ch];
-
-            if (ch_len != 0) {
-                switch (ch_len) {
-                    case 9: {
-                        beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
-                        switch (ch) {
-                            case '"':  appendEscape(BACKSLASH_DOUBLEQUOTE); break;
-                            case '\\': appendEscape(BACKSLASH_BACKSLASH); break;
-                            case '/':  appendEscape(BACKSLASH_FORWARDSLASH); break;
-                            case '\b': appendEscape(BACKSLASH_B); break;
-                            case '\f': appendEscape(BACKSLASH_F); break;
-                            case '\n': appendEscape(BACKSLASH_N); break;
-                            case '\r': appendEscape(BACKSLASH_R); break;
-                            case '\t': appendEscape(BACKSLASH_T); break;
-                            default: {
-                                scratch[2] = '0';
-                                scratch[3] = '0';
-                                scratch[4] = hexdig[(ch >> 4) & 0xf];
-                                scratch[5] = hexdig[ch & 0xf];
-                                append(scratch, 0, 6);
-                                break;
-                            }
-                        }
-                        break;
-                    }
-                    default: {
-                        int wchar = 0;
-                        ch_len = ch_len & CHAR_LENGTH_MASK;
-
-                        switch(ch_len) {
-                            case 2:
-                                wchar = ptrBytes[ptr + pos] & 0x1F;
-                                break;
-                            case 3:
-                                wchar = ptrBytes[ptr + pos] & 0x0F;
-                                break;
-                            case 4:
-                                wchar = ptrBytes[ptr + pos] & CHAR_LENGTH_MASK;
-                                break;
-                        }
-
-                        for (short i = 1; i < ch_len; i++) {
-                            wchar = (wchar << 6) | (ptrBytes[ptr + pos +i] & 0x3F);
-                        }
-
-                        beg = pos = flushPos(pos, beg, ptrBytes, ptr, ch_len);
-
-                        if (wchar <= 0xFFFF) {
-                            scratch[2] = hexdig[wchar >> 12];
-                            scratch[3] = hexdig[(wchar >> 8) & 0xf];
-                            scratch[4] = hexdig[(wchar >> 4) & 0xf];
-                            scratch[5] = hexdig[wchar & 0xf];
-                            append(scratch, 0, 6);
-                        } else {
-                            int hi, lo;
-                            wchar -= 0x10000;
-                            hi = 0xD800 + (wchar >> 10);
-                            lo = 0xDC00 + (wchar & 0x3FF);
-
-                            scratch[2] = hexdig[hi >> 12];
-                            scratch[3] = hexdig[(hi >> 8) & 0xf];
-                            scratch[4] = hexdig[(hi >> 4) & 0xf];
-                            scratch[5] = hexdig[hi & 0xf];
-
-                            scratch[8] = hexdig[lo >> 12];
-                            scratch[9] = hexdig[(lo >> 8) & 0xf];
-                            scratch[10] = hexdig[(lo >> 4) & 0xf];
-                            scratch[11] = hexdig[lo & 0xf];
-
-                            append(scratch, 0, 12);
-                        }
-
-                        break;
-                    }
-                }
-            } else {
-                pos++;
+    protected void escapeAscii(int ch, byte[] scratch, byte[] hexdig) throws IOException {
+        switch (ch) {
+            case '"':  appendEscape(BACKSLASH_DOUBLEQUOTE); break;
+            case '\\': appendEscape(BACKSLASH_BACKSLASH); break;
+            case '/':  appendEscape(BACKSLASH_FORWARDSLASH); break;
+            case '\b': appendEscape(BACKSLASH_B); break;
+            case '\f': appendEscape(BACKSLASH_F); break;
+            case '\n': appendEscape(BACKSLASH_N); break;
+            case '\r': appendEscape(BACKSLASH_R); break;
+            case '\t': appendEscape(BACKSLASH_T); break;
+            default: {
+                scratch[2] = '0';
+                scratch[3] = '0';
+                scratch[4] = hexdig[(ch >> 4) & 0xf];
+                scratch[5] = hexdig[ch & 0xf];
+                append(scratch, 0, 6);
+                break;
             }
         }
-
-        if (beg < len) {
-            append(ptrBytes, ptr + beg, len - beg);
-        }
     }
 
     private void appendEscape(byte[] escape) throws IOException {
@@ -398,69 +292,6 @@ protected void append(byte[] origin, int start, int length) throws IOException {
         out.write(origin, start, length);
     }
 
-    private void handleChar(int c) throws IOException {
-        switch (c) {
-        case '"':
-        case '\\':
-            escapeChar((char)c);
-            break;
-        case '\n':
-            escapeChar('n');
-            break;
-        case '\r':
-            escapeChar('r');
-            break;
-        case '\t':
-            escapeChar('t');
-            break;
-        case '\f':
-            escapeChar('f');
-            break;
-        case '\b':
-            escapeChar('b');
-            break;
-        case '/':
-            if(scriptSafe) {
-                escapeChar((char)c);
-                break;
-            }
-        case 0x2028:
-        case 0x2029:
-            if (scriptSafe) {
-                quoteStop(charStart);
-                escapeUtf8Char(c);
-                break;
-            }
-        default:
-            if (c >= 0x20 && c <= 0x7f ||
-                    (c >= 0x80 && !asciiOnly)) {
-                quoteStart();
-            } else {
-                quoteStop(charStart);
-                escapeUtf8Char(c);
-            }
-        }
-    }
-
-    private void escapeChar(char c) throws IOException {
-        quoteStop(charStart);
-        aux[ESCAPE_CHAR_OFFSET + 1] = (byte)c;
-        append(aux, ESCAPE_CHAR_OFFSET, 2);
-    }
-
-    private void escapeUtf8Char(int codePoint) throws IOException {
-        int numChars = Character.toChars(codePoint, utf16, 0);
-        escapeCodeUnit(utf16[0], ESCAPE_UNI1_OFFSET + 2);
-        if (numChars > 1) escapeCodeUnit(utf16[1], ESCAPE_UNI2_OFFSET + 2);
-        append(aux, ESCAPE_UNI1_OFFSET, 6 * numChars);
-    }
-
-    private void escapeCodeUnit(char c, int auxOffset) {
-        for (int i = 0; i < 4; i++) {
-            aux[auxOffset + i] = HEX[(c >>> (12 - 4 * i)) & 0xf];
-        }
-    }
-
     @Override
     protected RaiseException invalidUtf8(ThreadContext context) {
         return Utils.newException(context, Utils.M_GENERATOR_ERROR, "source sequence is illegal/malformed utf-8");