Skip to content

Commit

Permalink
Refactor to avoid repeated boolean checks
Browse files Browse the repository at this point in the history
  • Loading branch information
headius committed Jan 16, 2025
1 parent 70aadd0 commit 0133cbc
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 209 deletions.
4 changes: 3 additions & 1 deletion java/src/json/ext/Generator.java
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,9 @@ public byte[] getCharBytes() {
public StringEncoder getStringEncoder(ThreadContext context) {
if (stringEncoder == null) {
GeneratorState state = getState(context);
stringEncoder = new StringEncoder(state.asciiOnly(), state.scriptSafe());
stringEncoder = state.asciiOnly() ?
new StringEncoderAsciiOnly(state.scriptSafe()) :
new StringEncoder(state.scriptSafe());
}
return stringEncoder;
}
Expand Down
247 changes: 39 additions & 208 deletions java/src/json/ext/StringEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.USASCIIEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jruby.Ruby;
import org.jruby.RubyException;
import org.jruby.RubyString;
import org.jruby.exceptions.RaiseException;
Expand All @@ -25,8 +26,8 @@
* to another ByteList. The source string is fully checked for UTF-8 validity,
* and throws a GeneratorError if any problem is found.
*/
final class StringEncoder extends ByteListTranscoder {
private static final int CHAR_LENGTH_MASK = 7;
class StringEncoder extends ByteListTranscoder {
protected static final int CHAR_LENGTH_MASK = 7;
private static final byte[] BACKSLASH_DOUBLEQUOTE = {'\\', '"'};
private static final byte[] BACKSLASH_BACKSLASH = {'\\', '\\'};
private static final byte[] BACKSLASH_FORWARDSLASH = {'\\', '/'};
Expand Down Expand Up @@ -111,39 +112,30 @@ final class StringEncoder extends ByteListTranscoder {
private static final byte[] BACKSLASH_U2028 = "\\u2028".getBytes(StandardCharsets.US_ASCII);
private static final byte[] BACKSLASH_U2029 = "\\u2029".getBytes(StandardCharsets.US_ASCII);

private final boolean asciiOnly, scriptSafe;
private final byte[] escapeTable;
protected final byte[] escapeTable;

OutputStream out;

// Escaped characters will reuse this array, to avoid new allocations
// or appending them byte-by-byte
private final byte[] aux =
protected final byte[] aux =
new byte[] {/* First Unicode character */
'\\', 'u', 0, 0, 0, 0,
/* Second unicode character (for surrogate pairs) */
'\\', 'u', 0, 0, 0, 0,
/* "\X" characters */
'\\', 0};
// offsets on the array above
private static final int ESCAPE_UNI1_OFFSET = 0;
private static final int ESCAPE_UNI2_OFFSET = ESCAPE_UNI1_OFFSET + 6;
private static final int ESCAPE_CHAR_OFFSET = ESCAPE_UNI2_OFFSET + 6;
/** Array used for code point decomposition in surrogates */
private final char[] utf16 = new char[2];

private static final byte[] HEX =

protected static final byte[] HEX =
new byte[] {'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};

StringEncoder(boolean asciiOnly, boolean scriptSafe) {
this.asciiOnly = asciiOnly;
this.scriptSafe = scriptSafe;
if (asciiOnly) {
escapeTable = scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ASCII_ONLY_ESCAPE_TABLE;
} else {
escapeTable = scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ESCAPE_TABLE;
}
StringEncoder(boolean scriptSafe) {
this(scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ESCAPE_TABLE);
}

StringEncoder(byte[] escapeTable) {
this.escapeTable = escapeTable;
}

// C: generate_json_string
Expand All @@ -157,11 +149,7 @@ void generate(ThreadContext context, RubyString object, OutputStream buffer) thr
switch (object.scanForCodeRange()) {
case StringSupport.CR_7BIT:
case StringSupport.CR_VALID:
if (asciiOnly) {
encodeASCII(byteList);
} else {
encode(byteList);
}
encode(byteList);
break;
default:
throw Utils.buildGeneratorError(context, object, "source sequence is illegal/malformed utf-8").toThrowable();
Expand All @@ -181,23 +169,26 @@ static RubyString ensureValidEncoding(ThreadContext context, RubyString str) {
}

private static RubyString tryWeirdEncodings(ThreadContext context, RubyString str, Encoding encoding) {
Ruby runtime = context.runtime;

RubyString utf8String;

if (encoding == ASCIIEncoding.INSTANCE) {
utf8String = str.strDup(context.runtime);
utf8String = str.strDup(runtime);
utf8String.setEncoding(UTF8Encoding.INSTANCE);
switch (utf8String.getCodeRange()) {
case StringSupport.CR_7BIT:
return utf8String;
case StringSupport.CR_VALID:
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Raise in 3.0.0
context.runtime.getWarnings().warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
runtime.getWarnings().warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
return utf8String;
}
}

try {
str = (RubyString) str.encode(context, context.runtime.getEncodingService().convertEncodingToRubyEncoding(UTF8Encoding.INSTANCE));
str = (RubyString) str.encode(context, runtime.getEncodingService().convertEncodingToRubyEncoding(UTF8Encoding.INSTANCE));
} catch (RaiseException re) {
RubyException exc = Utils.buildGeneratorError(context, str, re.getMessage());
exc.setCause(re.getException());
Expand Down Expand Up @@ -229,24 +220,7 @@ void encode(ByteList src) throws IOException {
switch (ch_len) {
case 9: {
beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
switch (ch) {
case '"': appendEscape(BACKSLASH_DOUBLEQUOTE); break;
case '\\': appendEscape(BACKSLASH_BACKSLASH); break;
case '/': appendEscape(BACKSLASH_FORWARDSLASH); break;
case '\b': appendEscape(BACKSLASH_B); break;
case '\f': appendEscape(BACKSLASH_F); break;
case '\n': appendEscape(BACKSLASH_N); break;
case '\r': appendEscape(BACKSLASH_R); break;
case '\t': appendEscape(BACKSLASH_T); break;
default: {
scratch[2] = '0';
scratch[3] = '0';
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
append(scratch, 0, 6);
break;
}
}
escapeAscii(ch, scratch, hexdig);
break;
}
case 11: {
Expand Down Expand Up @@ -280,110 +254,30 @@ void encode(ByteList src) throws IOException {
}
}

private int flushPos(int pos, int beg, byte[] ptrBytes, int ptr, int size) throws IOException {
protected int flushPos(int pos, int beg, byte[] ptrBytes, int ptr, int size) throws IOException {
if (pos > beg) { append(ptrBytes, ptr + beg, pos - beg); }
return pos + size;
}

// C: convert_UTF8_to_ASCII_only_JSON
void encodeASCII(ByteList src) throws IOException {
byte[] hexdig = HEX;
byte[] scratch = aux;
byte[] escapeTable = this.escapeTable;

byte[] ptrBytes = src.unsafeBytes();
int ptr = src.begin();
int len = src.realSize();

int beg = 0;
int pos = 0;

while (pos < len) {
int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]);
int ch_len = escapeTable[ch];

if (ch_len != 0) {
switch (ch_len) {
case 9: {
beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
switch (ch) {
case '"': appendEscape(BACKSLASH_DOUBLEQUOTE); break;
case '\\': appendEscape(BACKSLASH_BACKSLASH); break;
case '/': appendEscape(BACKSLASH_FORWARDSLASH); break;
case '\b': appendEscape(BACKSLASH_B); break;
case '\f': appendEscape(BACKSLASH_F); break;
case '\n': appendEscape(BACKSLASH_N); break;
case '\r': appendEscape(BACKSLASH_R); break;
case '\t': appendEscape(BACKSLASH_T); break;
default: {
scratch[2] = '0';
scratch[3] = '0';
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
append(scratch, 0, 6);
break;
}
}
break;
}
default: {
int wchar = 0;
ch_len = ch_len & CHAR_LENGTH_MASK;

switch(ch_len) {
case 2:
wchar = ptrBytes[ptr + pos] & 0x1F;
break;
case 3:
wchar = ptrBytes[ptr + pos] & 0x0F;
break;
case 4:
wchar = ptrBytes[ptr + pos] & CHAR_LENGTH_MASK;
break;
}

for (short i = 1; i < ch_len; i++) {
wchar = (wchar << 6) | (ptrBytes[ptr + pos +i] & 0x3F);
}

beg = pos = flushPos(pos, beg, ptrBytes, ptr, ch_len);

if (wchar <= 0xFFFF) {
scratch[2] = hexdig[wchar >> 12];
scratch[3] = hexdig[(wchar >> 8) & 0xf];
scratch[4] = hexdig[(wchar >> 4) & 0xf];
scratch[5] = hexdig[wchar & 0xf];
append(scratch, 0, 6);
} else {
int hi, lo;
wchar -= 0x10000;
hi = 0xD800 + (wchar >> 10);
lo = 0xDC00 + (wchar & 0x3FF);

scratch[2] = hexdig[hi >> 12];
scratch[3] = hexdig[(hi >> 8) & 0xf];
scratch[4] = hexdig[(hi >> 4) & 0xf];
scratch[5] = hexdig[hi & 0xf];

scratch[8] = hexdig[lo >> 12];
scratch[9] = hexdig[(lo >> 8) & 0xf];
scratch[10] = hexdig[(lo >> 4) & 0xf];
scratch[11] = hexdig[lo & 0xf];

append(scratch, 0, 12);
}

break;
}
}
} else {
pos++;
protected void escapeAscii(int ch, byte[] scratch, byte[] hexdig) throws IOException {
switch (ch) {
case '"': appendEscape(BACKSLASH_DOUBLEQUOTE); break;
case '\\': appendEscape(BACKSLASH_BACKSLASH); break;
case '/': appendEscape(BACKSLASH_FORWARDSLASH); break;
case '\b': appendEscape(BACKSLASH_B); break;
case '\f': appendEscape(BACKSLASH_F); break;
case '\n': appendEscape(BACKSLASH_N); break;
case '\r': appendEscape(BACKSLASH_R); break;
case '\t': appendEscape(BACKSLASH_T); break;
default: {
scratch[2] = '0';
scratch[3] = '0';
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
append(scratch, 0, 6);
break;
}
}

if (beg < len) {
append(ptrBytes, ptr + beg, len - beg);
}
}

private void appendEscape(byte[] escape) throws IOException {
Expand All @@ -398,69 +292,6 @@ protected void append(byte[] origin, int start, int length) throws IOException {
out.write(origin, start, length);
}

private void handleChar(int c) throws IOException {
switch (c) {
case '"':
case '\\':
escapeChar((char)c);
break;
case '\n':
escapeChar('n');
break;
case '\r':
escapeChar('r');
break;
case '\t':
escapeChar('t');
break;
case '\f':
escapeChar('f');
break;
case '\b':
escapeChar('b');
break;
case '/':
if(scriptSafe) {
escapeChar((char)c);
break;
}
case 0x2028:
case 0x2029:
if (scriptSafe) {
quoteStop(charStart);
escapeUtf8Char(c);
break;
}
default:
if (c >= 0x20 && c <= 0x7f ||
(c >= 0x80 && !asciiOnly)) {
quoteStart();
} else {
quoteStop(charStart);
escapeUtf8Char(c);
}
}
}

private void escapeChar(char c) throws IOException {
quoteStop(charStart);
aux[ESCAPE_CHAR_OFFSET + 1] = (byte)c;
append(aux, ESCAPE_CHAR_OFFSET, 2);
}

private void escapeUtf8Char(int codePoint) throws IOException {
int numChars = Character.toChars(codePoint, utf16, 0);
escapeCodeUnit(utf16[0], ESCAPE_UNI1_OFFSET + 2);
if (numChars > 1) escapeCodeUnit(utf16[1], ESCAPE_UNI2_OFFSET + 2);
append(aux, ESCAPE_UNI1_OFFSET, 6 * numChars);
}

private void escapeCodeUnit(char c, int auxOffset) {
for (int i = 0; i < 4; i++) {
aux[auxOffset + i] = HEX[(c >>> (12 - 4 * i)) & 0xf];
}
}

@Override
protected RaiseException invalidUtf8(ThreadContext context) {
return Utils.newException(context, Utils.M_GENERATOR_ERROR, "source sequence is illegal/malformed utf-8");
Expand Down
Loading

0 comments on commit 0133cbc

Please sign in to comment.