Skip to content

Commit

Permalink
Refactor convert_UTF8_to_JSON to split searching and escaping code
Browse files Browse the repository at this point in the history
The goal is to be able to dispatch to more optimized search implementations
without having to duplicate the escaping code.

Somehow, this is a few % faster already:

```
== Encoding activitypub.json (52595 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after     2.257k i/100ms
Calculating -------------------------------------
               after     22.930k (± 1.3%) i/s   (43.61 μs/i) -    115.107k in   5.020814s

Comparison:
              before:    21604.0 i/s
               after:    22930.1 i/s - 1.06x  faster

== Encoding citm_catalog.json (500298 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after   137.000 i/100ms
Calculating -------------------------------------
               after      1.397k (± 1.1%) i/s  (715.57 μs/i) -      6.987k in   5.000408s

Comparison:
              before:     1344.4 i/s
               after:     1397.5 i/s - 1.04x  faster

== Encoding twitter.json (466906 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after   249.000 i/100ms
Calculating -------------------------------------
               after      2.464k (± 1.8%) i/s  (405.81 μs/i) -     12.450k in   5.054131s

Comparison:
              before:     2326.5 i/s
               after:     2464.2 i/s - 1.06x  faster
```
  • Loading branch information
byroot committed Jan 31, 2025
1 parent 1023227 commit 99a3aea
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 69 deletions.
157 changes: 88 additions & 69 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
// 0 - single byte char that doesn't need to be escaped.
// (x | 8) - char that needs to be escaped.
// Selects the low three bits of an escape-table entry: the character's length in bytes.
static const unsigned char CHAR_LENGTH_MASK = 7;
// Bit set on escape-table entries whose character needs to be escaped.
static const unsigned char ESCAPE_MASK = 8;

static const unsigned char escape_table[256] = {
// ASCII Control Characters
Expand Down Expand Up @@ -165,6 +166,84 @@ static const unsigned char script_safe_escape_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};


// Scanning state shared by the search and escape helpers while one string is
// being converted.
typedef struct _search_state {
// Current scan position in the input.
const char *ptr;
// One past the last byte of the input.
const char *end;
// Start of the input bytes that have been scanned but not yet copied to
// `buffer`; search_flush() copies [cursor, ptr) and then moves cursor to ptr.
const char *cursor;
// Output buffer that receives the copied and escaped bytes.
FBuffer *buffer;
} search_state;

/* Writes the JSON escape sequence for the character at state->ptr into
 * state->buffer, then steps both state->ptr and state->cursor past it.
 *
 * ch_len is the character's length in bytes:
 *   1 - a single byte, emitted either as a two-character escape (\" \\ \/ \b
 *       \f \n \r \t) or as \u00XX with lowercase hex digits.
 *   3 - a three-byte character, emitted as \u2028 or \u2029.
 * Any other length emits nothing; the character is still skipped.
 */
static inline void escape_UTF8_char(search_state *state, unsigned char ch_len) {
    static const char hex_digits[] = "0123456789abcdef";
    const unsigned char byte = (unsigned char)*state->ptr;

    if (ch_len == 1) {
        /* Start from the long \u00XX form and shorten it when the byte has
         * a dedicated two-character escape. */
        char escaped[6] = { '\\', 'u', '0', '0', 0, 0 };
        unsigned long escaped_len = 2;

        switch (byte) {
            case '"':  escaped[1] = '"';  break;
            case '\\': escaped[1] = '\\'; break;
            case '/':  escaped[1] = '/';  break;
            case '\b': escaped[1] = 'b';  break;
            case '\f': escaped[1] = 'f';  break;
            case '\n': escaped[1] = 'n';  break;
            case '\r': escaped[1] = 'r';  break;
            case '\t': escaped[1] = 't';  break;
            default:
                escaped[4] = hex_digits[(byte >> 4) & 0xf];
                escaped[5] = hex_digits[byte & 0xf];
                escaped_len = 6;
                break;
        }
        fbuffer_append(state->buffer, escaped, escaped_len);
    } else if (ch_len == 3) {
        /* The low bit of the third byte tells the two separators apart:
         * ...A8 is U+2028, ...A9 is U+2029. */
        const char *separator = (state->ptr[2] & 1) ? "\\u2029" : "\\u2028";
        fbuffer_append(state->buffer, separator, 6);
    }

    /* The escaped character is consumed: nothing before the new position is
     * left waiting to be flushed. */
    state->ptr += ch_len;
    state->cursor = state->ptr;
}

/* Copies the scanned-but-unwritten input bytes, i.e. the range
 * [state->cursor, state->ptr), to state->buffer and marks them as written by
 * moving state->cursor up to state->ptr.
 */
static inline void search_flush(search_state *state)
{
    const char *pending = state->cursor;
    const char *scan_pos = state->ptr;

    fbuffer_append(state->buffer, pending, scan_pos - pending);
    state->cursor = scan_pos;
}

/* Scans forward from state->ptr for the next character that must be escaped
 * according to escape_table.
 *
 * Each table entry is 0 for a single byte that needs no escaping, otherwise
 * the character's byte length in the CHAR_LENGTH_MASK bits, with ESCAPE_MASK
 * set when the character needs escaping.
 *
 * Returns the byte length of the character to escape, with state->ptr left
 * pointing at it and every byte before it already flushed to state->buffer.
 * Returns 0 once the input is exhausted, after flushing whatever remained.
 *
 * state->ptr never moves past state->end, even when the input ends in the
 * middle of a multi-byte sequence, so the flush cannot copy bytes from
 * beyond the input.
 */
static inline unsigned char search_escape(search_state *state, const unsigned char escape_table[256])
{
    while (state->ptr < state->end) {
        unsigned char ch = (unsigned char)*state->ptr;
        unsigned char ch_len = escape_table[ch];

        if (RB_UNLIKELY(ch_len)) {
            if (ch_len & ESCAPE_MASK) {
                /* 11 == (3 | ESCAPE_MASK): a three-byte lead byte that only
                 * needs escaping when it starts U+2028 or U+2029. */
                if (RB_UNLIKELY(ch_len == 11)) {
                    const unsigned char *uptr = (const unsigned char *)state->ptr;
                    if (state->end - state->ptr < 3) {
                        /* Truncated sequence at the end of the input: there
                         * is nothing to inspect, pass the tail through. */
                        state->ptr = state->end;
                        continue;
                    }
                    /* (byte >> 1) == 0x54 matches exactly 0xA8 and 0xA9. */
                    if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) {
                        state->ptr += 3;
                        continue;
                    }
                }
                search_flush(state);
                return ch_len & CHAR_LENGTH_MASK;
            } else if (state->end - state->ptr < ch_len) {
                /* Multi-byte character cut off by the end of the input. */
                state->ptr = state->end;
            } else {
                state->ptr += ch_len;
            }
        } else {
            state->ptr++;
        }
    }
    search_flush(state);
    return 0;
}

/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
Expand All @@ -183,77 +262,17 @@ static const unsigned char script_safe_escape_table[256] = {
*/
static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

const char *ptr = RSTRING_PTR(str);
unsigned long len = RSTRING_LEN(str);

unsigned long beg = 0, pos = 0;

#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;

while (pos < len) {
unsigned char ch = ptr[pos];
unsigned char ch_len = escape_table[ch];
/* JSON encoding */
long len;
search_state state;
state.buffer = out_buffer;
RSTRING_GETMEM(str, state.ptr, len);
state.cursor = state.ptr;
state.end = state.ptr + len;

if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
case 9: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default: {
scratch[2] = '0';
scratch[3] = '0';
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
break;
}
}
break;
}
case 11: {
unsigned char b2 = ptr[pos + 1];
if (RB_UNLIKELY(b2 == 0x80)) {
unsigned char b3 = ptr[pos + 2];
if (b3 == 0xA8) {
FLUSH_POS(3);
fbuffer_append(out_buffer, "\\u2028", 6);
break;
} else if (b3 == 0xA9) {
FLUSH_POS(3);
fbuffer_append(out_buffer, "\\u2029", 6);
break;
}
}
ch_len = 3;
// fallthrough
}
default:
pos += ch_len;
break;
}
} else {
pos++;
}
}
#undef FLUSH_POS

if (beg < len) {
fbuffer_append(out_buffer, &ptr[beg], len - beg);
unsigned char ch_len;
while ((ch_len = search_escape(&state, escape_table))) {
escape_UTF8_char(&state, ch_len);
}

RB_GC_GUARD(str);
}

static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
Expand Down
6 changes: 6 additions & 0 deletions test/json/json_generator_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,12 @@ def test_nonutf8_encoding
assert_equal("\"5\u{b0}\"", "5\xb0".dup.force_encoding(Encoding::ISO_8859_1).to_json)
end

# Multi-byte UTF-8 characters must be emitted unchanged, while the quote and
# control characters are still escaped.
def test_utf8_multibyte
  cases = [
    ['["foßbar"]', ["foßbar"]],
    ['"n€ßt€ð2"', "n€ßt€ð2"],
    ['"\"\u0000\u001f"', "\"\u0000\u001f"],
  ]

  cases.each do |expected_json, input|
    assert_equal(expected_json, JSON.generate(input))
  end
end

def test_fragment
fragment = JSON::Fragment.new(" 42")
assert_equal '{"number": 42}', JSON.generate({ number: fragment })
Expand Down

0 comments on commit 99a3aea

Please sign in to comment.